From db6a2326ea40065b0af534e4880564efeb52227b Mon Sep 17 00:00:00 2001 From: weiweishi Date: Tue, 1 Nov 2022 10:06:50 +0800 Subject: [PATCH 01/16] add roc_bert --- docs/source/en/model_doc/roc_bert.mdx | 86 + src/transformers/__init__.py | 34 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/auto/modeling_auto.py | 8 + src/transformers/models/roc_bert/__init__.py | 94 + .../models/roc_bert/configuration_roc_bert.py | 144 ++ .../models/roc_bert/modeling_roc_bert.py | 1729 +++++++++++++++++ .../models/roc_bert/tokenization_roc_bert.py | 1108 +++++++++++ .../roc_bert/tokenization_roc_bert.py.bak | 529 +++++ tests/models/roc_bert/__init__.py | 0 tests/models/roc_bert/log.txt | 43 + .../models/roc_bert/test_modeling_roc_bert.py | 541 ++++++ .../roc_bert/test_tokenization_roc_bert.py | 332 ++++ 14 files changed, 4652 insertions(+) create mode 100644 docs/source/en/model_doc/roc_bert.mdx create mode 100644 src/transformers/models/roc_bert/__init__.py create mode 100644 src/transformers/models/roc_bert/configuration_roc_bert.py create mode 100644 src/transformers/models/roc_bert/modeling_roc_bert.py create mode 100644 src/transformers/models/roc_bert/tokenization_roc_bert.py create mode 100644 src/transformers/models/roc_bert/tokenization_roc_bert.py.bak create mode 100644 tests/models/roc_bert/__init__.py create mode 100644 tests/models/roc_bert/log.txt create mode 100644 tests/models/roc_bert/test_modeling_roc_bert.py create mode 100644 tests/models/roc_bert/test_tokenization_roc_bert.py diff --git a/docs/source/en/model_doc/roc_bert.mdx b/docs/source/en/model_doc/roc_bert.mdx new file mode 100644 index 0000000000000..de5c9ae104e15 --- /dev/null +++ b/docs/source/en/model_doc/roc_bert.mdx @@ -0,0 +1,86 @@ + + +# RocBert + +## Overview + +The RocBert model was proposed in []() by . + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](). The original code can be found [here](). 
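+A minimal usage sketch (note that `roc-bert-base-cased` is the placeholder checkpoint name used elsewhere in this patch and may not correspond to a published checkpoint; besides the usual `input_ids`, the model also accepts optional `input_shape_ids` and `input_pronunciation_ids` drawn from its shape and pronunciation vocabularies):
+
+```python
+from transformers import RocBertForMaskedLM, RocBertTokenizer
+
+# "roc-bert-base-cased" is the placeholder checkpoint name used in this patch, not necessarily available on the Hub
+tokenizer = RocBertTokenizer.from_pretrained("roc-bert-base-cased")
+model = RocBertForMaskedLM.from_pretrained("roc-bert-base-cased")
+
+inputs = tokenizer("这是一个例子", return_tensors="pt")
+outputs = model(**inputs)
+logits = outputs.logits
+```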
+ +## RocBertConfig + +[[autodoc]] RocBertConfig + + +## RocBertTokenizer + +[[autodoc]] RocBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + + +## RocBertTokenizerFast + +[[autodoc]] RocBertTokenizerFast + + +## RocBertModel + +[[autodoc]] RocBertModel + - forward + + +## RocBertForCausalLM + +[[autodoc]] RocBertForCausalLM + - forward + + +## RocBertForMaskedLM + +[[autodoc]] RocBertForMaskedLM + - forward + + +## RocBertForSequenceClassification + +[[autodoc]] transformers.RocBertForSequenceClassification + - forward + +## RocBertForMultipleChoice + +[[autodoc]] transformers.RocBertForMultipleChoice + - forward + + +## RocBertForTokenClassification + +[[autodoc]] transformers.RocBertForTokenClassification + - forward + + +## RocBertForQuestionAnswering + +[[autodoc]] RocBertForQuestionAnswering + - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b30d8f719f3a0..6b434448430bc 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -120,6 +120,7 @@ ], "models": [], # Models + "models.roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RocBertConfig", "RocBertTokenizer"], "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.auto": [ "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -561,6 +562,7 @@ ] else: # Fast tokenizers structure + _import_structure["models.roc_bert"].append("RocBertTokenizerFast") _import_structure["models.albert"].append("AlbertTokenizerFast") _import_structure["models.bart"].append("BartTokenizerFast") _import_structure["models.barthez"].append("BarthezTokenizerFast") @@ -841,6 +843,22 @@ # PyTorch models structure + _import_structure["models.roc_bert"].extend( + [ + "ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "RocBertForMaskedLM", + "RocBertForCausalLM", + "RocBertForMultipleChoice", + "RocBertForQuestionAnswering", + "RocBertForSequenceClassification", + "RocBertForTokenClassification", + "RocBertLayer", + "RocBertModel", + "RocBertPreTrainedModel", + "load_tf_weights_in_roc_bert", + ] + ) + _import_structure["models.time_series_transformer"].extend( [ "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3179,6 +3197,7 @@ load_tf2_weights_in_pytorch_model, ) from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig + from .models.roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RocBertConfig, RocBertTokenizer from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, @@ -3577,6 +3596,7 @@ from .utils.dummy_tokenizers_objects import * else: # Fast tokenizers imports + from .models.roc_bert import RocBertTokenizerFast from .models.albert import AlbertTokenizerFast from .models.bart import BartTokenizerFast from .models.barthez import BarthezTokenizerFast @@ -3805,6 +3825,20 @@ from .modeling_utils import PreTrainedModel # PyTorch model imports + + from .models.roc_bert import ( + ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + RocBertForMaskedLM, + RocBertForCausalLM, + RocBertForMultipleChoice, + RocBertForQuestionAnswering, + RocBertForSequenceClassification, + RocBertForTokenClassification, + RocBertLayer, + RocBertModel, + RocBertPreTrainedModel, + load_tf_weights_in_roc_bert, + ) from .models.albert import ( ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertForMaskedLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 86a775a1eb2b8..d1428107078b6 100644 --- 
a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -17,6 +17,7 @@ # limitations under the License. from . import ( + roc_bert, albert, auto, bart, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 2973e5574d6a4..fcedd4806ea23 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -29,6 +29,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here + ("roc_bert", "RocBertConfig"), ("albert", "AlbertConfig"), ("bart", "BartConfig"), ("beit", "BeitConfig"), @@ -170,6 +171,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( [ # Add archive maps here) + ("roc_bert", "ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -296,6 +298,7 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here + ("roc_bert", "RocBert"), ("albert", "ALBERT"), ("bart", "BART"), ("barthez", "BARThez"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 3da1dc1790572..3d6f339b1abb1 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -28,6 +28,7 @@ MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping + ("roc_bert", "RocBertModel"), ("albert", "AlbertModel"), ("bart", "BartModel"), ("beit", "BeitModel"), @@ -218,6 +219,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( [ # Model with LM heads mapping +("roc_bert", "RocBertForMaskedLM"), ("albert", "AlbertForMaskedLM"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForMaskedLM"), @@ -287,6 +289,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( [ # Model for Causal LM mapping + ("roc_bert", "RocBertForCausalLM"), ("bart", "BartForCausalLM"), ("bert", "BertLMHeadModel"), ("bert-generation", "BertGenerationDecoder"), @@ -421,6 +424,7 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( [ # Model for Masked LM mapping +("roc_bert", "RocBertForMaskedLM"), ("albert", "AlbertForMaskedLM"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForMaskedLM"), @@ -525,6 +529,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Sequence Classification mapping + ("roc_bert", "RocBertForSequenceClassification"), ("albert", "AlbertForSequenceClassification"), ("bart", "BartForSequenceClassification"), ("bert", "BertForSequenceClassification"), @@ -587,6 +592,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( [ # Model for Question Answering mapping + ("roc_bert", "RocBertForQuestionAnswering"), ("albert", "AlbertForQuestionAnswering"), ("bart", "BartForQuestionAnswering"), ("bert", "BertForQuestionAnswering"), @@ -662,6 +668,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Token Classification mapping +("roc_bert", "RocBertForTokenClassification"), ("albert", "AlbertForTokenClassification"), ("bert", "BertForTokenClassification"), ("big_bird", "BigBirdForTokenClassification"), @@ -709,6 +716,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict( [ # Model for Multiple Choice mapping +("roc_bert", "RocBertForMultipleChoice"), ("albert", "AlbertForMultipleChoice"), ("bert", "BertForMultipleChoice"), ("big_bird", "BigBirdForMultipleChoice"), diff --git a/src/transformers/models/roc_bert/__init__.py 
b/src/transformers/models/roc_bert/__init__.py
new file mode 100644
index 0000000000000..507100b875155
--- /dev/null
+++ b/src/transformers/models/roc_bert/__init__.py
@@ -0,0 +1,94 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+# rely on isort to merge the imports
+from ...utils import _LazyModule, OptionalDependencyNotAvailable, is_tokenizers_available
+from ...utils import is_torch_available
+
+_import_structure = {
+    "configuration_roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RocBertConfig"],
+    "tokenization_roc_bert": ["RocBertTokenizer"],
+}
+
+try:
+    if not is_tokenizers_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["tokenization_roc_bert_fast"] = ["RocBertTokenizerFast"]
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_roc_bert"] = [
+        "ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "RocBertForMaskedLM",
+        "RocBertForCausalLM",
+        "RocBertForMultipleChoice",
+        "RocBertForQuestionAnswering",
+        "RocBertForSequenceClassification",
+        "RocBertForTokenClassification",
+        "RocBertLayer",
+        "RocBertModel",
+        "RocBertPreTrainedModel",
+        "load_tf_weights_in_roc_bert",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RocBertConfig
+    from .tokenization_roc_bert import RocBertTokenizer
+
+    try:
+        if not is_tokenizers_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .tokenization_roc_bert_fast import RocBertTokenizerFast
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_roc_bert import (
+            ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+            RocBertForMaskedLM,
+            RocBertForCausalLM,
+            RocBertForMultipleChoice,
+            RocBertForQuestionAnswering,
+            RocBertForSequenceClassification,
+            RocBertForTokenClassification,
+            RocBertLayer,
+            RocBertModel,
+            RocBertPreTrainedModel,
+            load_tf_weights_in_roc_bert,
+        )
+
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py
new file mode 100644
index 0000000000000..4c88137cc1e78
--- /dev/null
+++ b/src/transformers/models/roc_bert/configuration_roc_bert.py
@@ -0,0 +1,144 @@
+# coding=utf-8
+# Copyright 2022 weiweishi and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" RocBert model configuration """
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+logger = logging.get_logger(__name__)
+
+ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "roc-bert-base-cased": "https://huggingface.co/roc-bert-base-cased/resolve/main/config.json",
+    # See all RocBert models at https://huggingface.co/models?filter=roc_bert
+}
+
+
+class RocBertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`~RocBertModel`]. It is used to instantiate a
+    RocBert model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the RocBert
+    [roc-bert-base-cased](https://huggingface.co/roc-bert-base-cased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the RocBert model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`~RocBertModel`] or [`~TFRocBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something
+            large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`~RocBertModel`] or [`~TFRocBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+ use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + Example: + + ```python + >>> from transformers import RocBertModel, RocBertConfig + + >>> # Initializing a RocBert roc-bert-base-cased style configuration + >>> configuration = RocBertConfig() + + >>> # Initializing a model from the roc-bert-base-cased style configuration + >>> model = RocBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` +""" + model_type = "roc_bert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + use_cache=True, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + enable_cls=True, + enable_pronunciation=True, + enable_shape=True, + pronunciation_embed_dim=768, + pronunciation_vocab_size=910, + shape_embed_dim=512, + shape_vocab_size=24858, + concat_input=True, + **kwargs + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.enable_cls = enable_cls + self.enable_pronunciation = enable_pronunciation + self.enable_shape = enable_shape + self.pronunciation_embed_dim = pronunciation_embed_dim + self.pronunciation_vocab_size = pronunciation_vocab_size + self.shape_embed_dim = shape_embed_dim + self.shape_vocab_size = shape_vocab_size + self.concat_input = concat_input + self.position_embedding_type = position_embedding_type + self.classifier_dropout = classifier_dropout + super().__init__( + pad_token_id=pad_token_id, + **kwargs + ) diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py new file mode 100644 index 0000000000000..825509a9315bd --- /dev/null +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -0,0 +1,1729 @@ +# coding=utf-8 +# Copyright 2022 weiweishi The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch RocBert model. 
""" + +import math +import os +from typing import Optional, Tuple, Union, List + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from .configuration_roc_bert import RocBertConfig +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ( + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...utils import logging + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "roc-bert-base-cased" +_CONFIG_FOR_DOC = "RocBertConfig" +_TOKENIZER_FOR_DOC = "RocBertTokenizer" + +ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "roc-bert-base-cased", + # See all RocBert models at https://huggingface.co/models?filter=roc_bert +] + + +def load_tf_weights_in_roc_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" + except AssertionError as e: + e.args += (pointer.shape, array.shape) + 
raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +class RocBertEmbeddings(nn.Module): + """Construct the embeddings from word, position, shape, pronunciation and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.pronunciation_embed = nn.Embedding(config.pronunciation_vocab_size, config.pronunciation_embed_dim, + padding_idx=config.pad_token_id) + self.shape_embed = nn.Embedding(config.shape_vocab_size, config.shape_embed_dim, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.enable_pronunciation = config.enable_pronunciation + self.enable_shape = config.enable_shape + + if config.concat_input: + input_dim = config.hidden_size + if self.enable_pronunciation: + pronunciation_dim = config.pronunciation_embed_dim + input_dim += pronunciation_dim + if self.enable_shape: + shape_dim = config.shape_embed_dim + input_dim += shape_dim + self.map_inputs_layer = torch.nn.Linear(input_dim, config.hidden_size) + else: + self.map_inputs_layer = None + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), + persistent=False, + ) + + def forward( + self, input_ids=None, input_shape_ids=None, input_pronunciation_ids=None, token_type_ids=None, + position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length: seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if self.map_inputs_layer is None: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = 
self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + + denominator = 1 + embedding_in = torch.clone(embeddings) + if self.enable_shape and input_shape_ids is not None: + embedding_shape = self.shape_embed(input_shape_ids) + embedding_in += embedding_shape + denominator += 1 + if self.enable_pronunciation and input_pronunciation_ids is not None: + embedding_pronunciation = self.pronunciation_embed(input_pronunciation_ids) + embedding_in += embedding_pronunciation + denominator += 1 + + embedding_in /= denominator + return embedding_in + else: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) # embedding_word + device = inputs_embeds.device + + embedding_in = torch.clone(inputs_embeds) + if self.enable_shape: + if input_shape_ids is None: + input_shape_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + embedding_shape = self.shape_embed(input_shape_ids) + embedding_in = torch.cat((embedding_in, embedding_shape), -1) + if self.enable_pronunciation: + if input_pronunciation_ids is None: + input_pronunciation_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + embedding_pronunciation = self.pronunciation_embed(input_pronunciation_ids) + embedding_in = torch.cat((embedding_in, embedding_pronunciation), -1) + + embedding_in = self.map_inputs_layer(embedding_in) # batch_size * seq_len * hidden_dim + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embedding_in += token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embedding_in += position_embeddings + + embedding_in = self.LayerNorm(embedding_in) + embedding_in = self.dropout(embedding_in) + return embedding_in + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->RocBert +class RocBertSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = 
self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RocBertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
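+        # (the additive `attention_mask` computed in RocBertModel.forward() holds large negative
+        # values at masked positions, so those positions receive ~0 weight after the softmax below)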
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->RocBert +class RocBertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->RocBert +class RocBertAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = RocBertSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = RocBertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->RocBert +class RocBertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states 
= self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->RocBert +class RocBertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->RocBert +class RocBertLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = RocBertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = RocBertAttention(config, position_embedding_type="absolute") + self.intermediate = RocBertIntermediate(config) + self.output = RocBertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last 
output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->RocBert +class RocBertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([RocBertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->RocBert +class RocBertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
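+        # (position 0, i.e. the [CLS] token)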
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->RocBert +class RocBertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->RocBert +class RocBertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = RocBertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->RocBert +class RocBertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = RocBertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class RocBertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + + config_class = RocBertConfig + load_tf_weights = load_tf_weights_in_roc_bert + base_model_prefix = "roc_bert" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, RocBertEncoder): + module.gradient_checkpointing = value + + +ROC_BERT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general + usage and behavior. 
+ + Parameters: + config ([`~RocBertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ROC_BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`RocBertTokenizer`]. + See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + input_shape_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the shape vocabulary. + + Indices can be obtained using [`RocBertTokenizer`]. + See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + input_pronunciation_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the pronunciation vocabulary. + + Indices can be obtained using [`RocBertTokenizer`]. + See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range `[0, config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert *input_ids* indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare RocBert Model transformer outputting raw hidden-states without any specific head on top.", + ROC_BERT_START_DOCSTRING, +) +class RocBertModel(RocBertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well + as a decoder, in which case a layer of cross-attention is added between + the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, + Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + `is_decoder` argument of the configuration set to `True`. + To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` + argument and `add_cross_attention` set to `True`; an + `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->RocBert + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = RocBertEmbeddings(config) + self.encoder = RocBertEncoder(config) + + self.pooler = RocBertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.bert.modeling_bert.BertModel.get_input_embeddings + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + # Copied from transformers.models.bert.modeling_bert.BertModel.set_input_embeddings + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def get_pronunciation_embeddings(self): + return self.embeddings.pronunciation_embed + + def set_pronunciation_embeddings(self, value): + self.embeddings.pronunciation_embed = value + + def get_shape_embeddings(self): + return self.embeddings.shape_embed + + def set_shape_embeddings(self, value): + self.embeddings.shape_embed = value + + # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads + def _prune_heads(self, heads_to_prune): + """Prunes heads of the model. 
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask + is used in the cross-attention if the model is configured as a decoder. + Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` + (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` + instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up + decoding (see `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings("""RocBert Model with a `language modeling` head on top. """, ROC_BERT_START_DOCSTRING) +class RocBertForMaskedLM(RocBertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->RocBert,bert->roc_bert + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `RocBertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.roc_bert = RocBertModel(config) + self.cls = RocBertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.get_output_embeddings + def get_output_embeddings(self): + return self.cls.predictions.decoder + + # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. + Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) + Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels + in `[0, ..., config.vocab_size]`. 
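+
+            A minimal sketch of how such labels are commonly built for masked language modeling (the
+            `tokenizer` / `inputs` names below are illustrative, not part of this model's API):
+
+            ```python
+            >>> labels = inputs["input_ids"].clone()
+            >>> # only the [MASK] positions contribute to the loss; every other position is set to -100
+            >>> labels[inputs["input_ids"] != tokenizer.mask_token_id] = -100
+            ```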
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roc_bert( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, input_shape_ids=None, input_pronunciation_ids=None, + attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + if input_shape_ids is not None: + input_shape_ids = torch.cat([input_shape_ids, dummy_token], dim=1) + if input_pronunciation_ids is not None: + input_pronunciation_ids = torch.cat([input_pronunciation_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "input_shape_ids": input_shape_ids, + "input_pronunciation_ids": input_pronunciation_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """RocBert Model with a `language modeling` head on top for CLM fine-tuning. 
""", ROC_BERT_START_DOCSTRING +) +class RocBertForCausalLM(RocBertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->RocBertForCausalLM,Bert->RocBert,bert->roc_bert + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `RocBertForCausalLM` as a standalone, add `is_decoder=True.`") + + self.roc_bert = RocBertModel(config) + self.cls = RocBertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.get_output_embeddings + def get_output_embeddings(self): + return self.cls.predictions.decoder + + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 + tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional + tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two + additional tensors are only required when the model is used as a decoder in a Sequence to Sequence + model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential + decoding. 
+ + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` + (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` + instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up + decoding (see `past_key_values`). + + Returns: + + Example: + + ```python + >>> from transformers import RocBertTokenizer, RocBertForCausalLM, RocBertConfig + >>> import torch + + >>> tokenizer = RocBertTokenizer.from_pretrained('roc-bert-base-cased') + >>> config = RocBertConfig.from_pretrained("roc-bert-base-cased") + >>> config.is_decoder = True + >>> model = RocBertForCausalLM.from_pretrained('roc-bert-base-cased', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ``` +""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roc_bert( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, input_shape_ids=None, input_pronunciation_ids=None, past=None, + attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + if input_shape_ids is not None: + input_shape_ids = input_shape_ids[:, -1:] + if input_pronunciation_ids is not None: + input_pronunciation_ids = input_pronunciation_ids[:, -1:] + + 
return {"input_ids": input_ids, "input_shape_ids": input_shape_ids, + "input_pronunciation_ids": input_pronunciation_ids, "attention_mask": attention_mask, + "past_key_values": past} + + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel._reorder_cache + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],) + return reordered_past + + +@add_start_docstrings( + """RocBert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + ROC_BERT_START_DOCSTRING, +) +class RocBertForSequenceClassification(RocBertPreTrainedModel): + # Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__ with Bert->RocBert,bert->roc_bert + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.roc_bert = RocBertModel(config) + + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., config.num_labels - 1]`. + If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
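+
+            A rough sketch of the label layouts the three supported problem types expect (values are
+            illustrative only):
+
+            ```python
+            >>> labels = torch.tensor([1, 0])  # single-label classification: int class indices, shape (batch_size,)
+            >>> labels = torch.tensor([0.7, -1.2])  # regression (config.num_labels == 1): float targets
+            >>> labels = torch.tensor([[1.0, 0.0, 1.0]])  # multi-label classification: float multi-hot vectors
+            ```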
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roc_bert( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """RocBert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", + ROC_BERT_START_DOCSTRING, +) +class RocBertForMultipleChoice(RocBertPreTrainedModel): + # Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ with Bert->RocBert,bert->roc_bert + def __init__(self, config): + super().__init__(config) + + self.roc_bert = RocBertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. + Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension + of the input tensors. 
(See `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + input_shape_ids = input_shape_ids.view(-1, input_shape_ids.size(-1)) if input_shape_ids is not None else None + input_pronunciation_ids = input_pronunciation_ids.view(-1, input_pronunciation_ids.size( + -1)) if input_pronunciation_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.roc_bert( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """RocBert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", + ROC_BERT_START_DOCSTRING, +) +class RocBertForTokenClassification(RocBertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->Ernie,bert->ernie + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roc_bert = RocBertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. + Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roc_bert( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """RocBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", + ROC_BERT_START_DOCSTRING, +) +class RocBertForQuestionAnswering(RocBertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->Ernie,bert->ernie + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roc_bert = RocBertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roc_bert( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py new file mode 100644 index 0000000000000..633e9fab845e0 --- /dev/null +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -0,0 +1,1108 @@ +# coding=utf-8 +# Copyright 2022 weiweishi and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
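+
+# NOTE: unlike a standard BERT tokenizer, `RocBertTokenizer` maps every token to three parallel id
+# sequences: `input_ids` (WordPiece vocabulary), `input_shape_ids` (glyph/shape vocabulary loaded from
+# `word_shape.json`) and `input_pronunciation_ids` (pronunciation vocabulary loaded from
+# `word_pronunciation.json`). Based on the code below, a call such as
+# `tokenizer("你好", return_tensors="pt")` is expected to return all three id tensors alongside
+# `token_type_ids` and `attention_mask`.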
+"""Tokenization classes for RocBert.""" + +import collections +import itertools +import json +import os +import unicodedata +from typing import List, Optional, Union, Dict, Tuple + +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...tokenization_utils_base import ( + TextInput, + PreTokenizedInput, + EncodedInput, + PaddingStrategy, + TensorType, + TruncationStrategy, + BatchEncoding, + ENCODE_KWARGS_DOCSTRING, + ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, + TextInputPair, + EncodedInputPair, + PreTokenizedInputPair, +) +from ...utils import logging, add_end_docstrings + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.txt", + "word_shape_file": "word_shape.json", + "word_pronunciation_file": "word_pronunciation.json" +} + +# todo: change the path +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/vocab.txt"}, + "word_shape_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/word_shape.json"}, + "word_pronunciation_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/word_shape.json"}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "roc-bert-base-uncased": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "roc-bert-base-uncased": {"do_lower_case": True}, +} + + +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class RocBertTokenizer(PreTrainedTokenizer): + r""" + Construct a RocBERT tokenizer. Based on WordPiece. + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + Args: + vocab_file (`str`): + File containing the vocabulary. + word_shape_file (`str`): + File containing the word => shape info. + word_pronunciation_file (`str`): + File containing the word => shape info. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. 
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + # model_input_names: List[str] = ["input_ids", "input_shape_ids", "input_pronunciation_ids", + # "token_type_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + word_shape_file, + word_pronunciation_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + for cur_file in [vocab_file, word_shape_file, word_pronunciation_file]: + if cur_file is None or not os.path.isfile(cur_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google " + "pretrained model use `tokenizer = RocBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + + self.vocab = load_vocab(vocab_file) + + with open(word_shape_file, "r", encoding="utf8") as in_file: + self.word_shape = json.load(in_file) + + with open(word_pronunciation_file, "r", encoding="utf8") as in_file: + self.word_pronunciation = json.load(in_file) + + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = RocBertBasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = RocBertWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size + @property + def vocab_size(self): + return len(self.vocab) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, **kwargs) + tokens_ids = self.convert_tokens_to_ids(tokens) + tokens_shape_ids = self.convert_tokens_to_shape_ids(tokens) + tokens_proun_ids = self.convert_tokens_to_pronunciation_ids(tokens) + return tokens_ids, tokens_shape_ids, tokens_proun_ids + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + if is_split_into_words: + tokens = list( + itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) + ) + tokens_ids = self.convert_tokens_to_ids(tokens) + tokens_shape_ids = self.convert_tokens_to_shape_ids(tokens) + tokens_proun_ids = self.convert_tokens_to_pronunciation_ids(tokens) + return tokens_ids, tokens_shape_ids, tokens_proun_ids + else: + tokens_ids = 
self.convert_tokens_to_ids(text) + tokens_shape_ids = self.convert_tokens_to_shape_ids(text) + tokens_proun_ids = self.convert_tokens_to_pronunciation_ids(text) + return tokens_ids, tokens_shape_ids, tokens_proun_ids + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text, [0] * len(text), [0] * len(text) # shape and proun id is pad_value + else: + if is_split_into_words: + raise ValueError( + f"Input {text} is not valid. Should be a string or a list/tuple of strings when" + " `is_split_into_words=True`." + ) + else: + raise ValueError( + f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of" + " integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast. " + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + first_ids, first_shape_ids, first_proun_ids = get_input_ids(text) + if text_pair is not None: + second_ids, second_shape_ids, second_proun_ids = get_input_ids(text_pair) + else: + second_ids, second_shape_ids, second_proun_ids = None, None, None + + return self.prepare_for_model( + first_ids, + first_shape_ids, + first_proun_ids, + pair_ids=second_ids, + pair_shape_ids=second_shape_ids, + pair_pronunciation_ids=second_proun_ids, + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def prepare_for_model( + self, + ids: List[int], + shape_ids: List[int], + pronunciation_ids: List[int], + pair_ids: Optional[List[int]] = None, + pair_shape_ids: Optional[List[int]] = None, + pair_pronunciation_ids: Optional[List[int]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + prepend_batch_axis: bool = False, + **kwargs + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids* + different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return + overflowing tokens. Such a combination of arguments will raise an error. 
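+
+        A minimal sketch of a typical call (the input string and ids are illustrative):
+
+        ```python
+        >>> tokens = tokenizer.tokenize("你好")
+        >>> encoding = tokenizer.prepare_for_model(
+        ...     tokenizer.convert_tokens_to_ids(tokens),
+        ...     tokenizer.convert_tokens_to_shape_ids(tokens),
+        ...     tokenizer.convert_tokens_to_pronunciation_ids(tokens),
+        ...     add_special_tokens=True,
+        ... )
+        >>> # encoding holds "input_ids", "input_shape_ids", "input_pronunciation_ids",
+        >>> # "token_type_ids" and "attention_mask"
+        ```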
+ + Args: + ids (`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and + `convert_tokens_to_id` methods. + shape_ids (`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and + `convert_token_to_shape_id` methods. + pronunciation_ids (`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and + `convert_token_to_pronunciation_id` methods. + pair_ids (`List[int]`, *optional*): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` + and `convert_tokens_to_id` methods. + pair_shape_ids (`List[int]`, *optional*): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` + and `convert_token_to_shape_id` methods. + pair_pronunciation_ids (`List[int]`, *optional*): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` + and `convert_token_to_pronunciation_id` methods. + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + if return_token_type_ids and not add_special_tokens: + raise ValueError( + "Asking to return token_type_ids while setting add_special_tokens to False " + "results in an undefined behavior. Please set add_special_tokens to True or " + "set return_token_type_ids to None." + ) + + if ( + return_overflowing_tokens + and truncation_strategy == TruncationStrategy.LONGEST_FIRST + and pair_ids is not None + ): + raise ValueError( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." 
+ ) + + # Load from model defaults + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = {} + + # Compute the total size of the returned encodings + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + + # Truncation: Handle max sequence length + overflowing_tokens = [] + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + shape_ids, pair_shape_ids, _ = self.truncate_sequences( + shape_ids, + pair_ids=pair_shape_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + pronunciation_ids, pair_pronunciation_ids, _ = self.truncate_sequences( + pronunciation_ids, + pair_ids=pair_pronunciation_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Add special tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + input_shape_ids = self.build_inputs_with_special_tokens(shape_ids, pair_shape_ids, + self.word_shape["[UNK]"], self.word_shape["[UNK]"]) + input_pronunciation_ids = self.build_inputs_with_special_tokens(pronunciation_ids, pair_pronunciation_ids, + self.word_pronunciation["[UNK]"], + self.word_pronunciation["[UNK]"]) + else: + sequence = ids + pair_ids if pair_ids else ids + token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair_ids else []) + input_shape_ids = shape_ids + pair_shape_ids if pair_shape_ids else shape_ids + input_pronunciation_ids = pronunciation_ids + pair_pronunciation_ids if pair_pronunciation_ids else pronunciation_ids + + # Build output dictionary + encoded_inputs["input_ids"] = sequence + encoded_inputs["input_shape_ids"] = input_shape_ids + encoded_inputs["input_pronunciation_ids"] = input_pronunciation_ids + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + # Check lengths + self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) + + # Padding + if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: + encoded_inputs = self.pad( + encoded_inputs, + max_length=max_length, + padding=padding_strategy.value, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + if return_length: + encoded_inputs["length"] = len(encoded_inputs["input_ids"]) + + batch_outputs = BatchEncoding( + encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis + ) + + return batch_outputs + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: 
PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if return_attention_mask and "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * len(required_input) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + for key in ["input_shape_ids", "input_pronunciation_ids"]: + if key in encoded_inputs: + encoded_inputs[key] = encoded_inputs[key] + [self.pad_token_id] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + for key in ["input_shape_ids", "input_pronunciation_ids"]: + if key in encoded_inputs: + encoded_inputs[key] = [self.pad_token_id] * difference + encoded_inputs[key] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + + return encoded_inputs + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, **kwargs) + 
tokens_ids = self.convert_tokens_to_ids(tokens) + tokens_shape_ids = self.convert_tokens_to_shape_ids(tokens) + tokens_proun_ids = self.convert_tokens_to_pronunciation_ids(tokens) + return tokens_ids, tokens_shape_ids, tokens_proun_ids + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + if is_split_into_words: + tokens = list( + itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) + ) + tokens_ids = self.convert_tokens_to_ids(tokens) + tokens_shape_ids = self.convert_tokens_to_shape_ids(tokens) + tokens_proun_ids = self.convert_tokens_to_pronunciation_ids(tokens) + return tokens_ids, tokens_shape_ids, tokens_proun_ids + else: + tokens_ids = self.convert_tokens_to_ids(text) + tokens_shape_ids = self.convert_tokens_to_shape_ids(text) + tokens_proun_ids = self.convert_tokens_to_pronunciation_ids(text) + return tokens_ids, tokens_shape_ids, tokens_proun_ids + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." + ) + + input_ids = [] + input_shape_ids = [] + input_pronunciation_ids = [] + for ids_or_pair_ids in batch_text_or_text_pairs: + if not isinstance(ids_or_pair_ids, (list, tuple)): + ids, pair_ids = ids_or_pair_ids, None + elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): + ids, pair_ids = ids_or_pair_ids, None + else: + ids, pair_ids = ids_or_pair_ids + + first_ids, first_shape_ids, first_proun_ids = get_input_ids(ids) + if pair_ids is not None: + second_ids, second_shape_ids, second_proun_ids = get_input_ids(pair_ids) + else: + second_ids, second_shape_ids, second_proun_ids = None, None, None + + input_ids.append((first_ids, second_ids)) + input_shape_ids.append((first_shape_ids, second_shape_ids)) + input_pronunciation_ids.append((first_proun_ids, second_proun_ids)) + + batch_outputs = self._batch_prepare_for_model( + input_ids, + batch_shape_ids_pairs=input_shape_ids, + batch_pronunciation_ids_pairs=input_pronunciation_ids, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=return_tensors, + verbose=verbose, + ) + + return BatchEncoding(batch_outputs) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def _batch_prepare_for_model( + self, + batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], + batch_shape_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], + batch_pronunciation_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + 
pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_length: bool = False, + verbose: bool = True, + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens + + Args: + batch_ids_pairs: list of tokenized input ids or input ids pairs + batch_shape_ids_pairs: list of tokenized input shape ids or input shape ids pairs + batch_pronunciation_ids_pairs: list of tokenized input pronunciation ids or input pronunciation ids pairs + """ + + batch_outputs = {} + for i, (first_ids, second_ids) in enumerate(batch_ids_pairs): + first_shape_ids, second_shape_ids = batch_shape_ids_pairs[i] + first_pronunciation_ids, second_pronunciation_ids = batch_pronunciation_ids_pairs[i] + outputs = self.prepare_for_model( + first_ids, + first_shape_ids, + first_pronunciation_ids, + pair_ids=second_ids, + pair_shape_ids=second_shape_ids, + pair_pronunciation_ids=second_pronunciation_ids, + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterward + return_attention_mask=False, # we pad in batch afterward + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return batch_outputs + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_token_to_shape_id(self, token): + """Converts a token (str) in an shape_id using the shape vocab.""" + return self.word_shape.get(token, self.word_shape.get(self.unk_token)) + + def convert_tokens_to_shape_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: + if tokens is None: + return None + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_shape_id(token)) + return ids + + def _convert_token_to_pronunciation_id(self, token): + """Converts a token (str) in an shape_id using the shape vocab.""" + return self.word_pronunciation.get(token, self.word_pronunciation.get(self.unk_token)) + + def convert_tokens_to_pronunciation_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: + if tokens is None: + return None + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_pronunciation_id(token)) + 
return ids + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, + cls_token_id: int = None, sep_token_id: int = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + cls = [self.cls_token_id] if cls_token_id is None else [cls_token_id] + sep = [self.sep_token_id] if sep_token_id is None else [sep_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. 
+ token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, str, str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"] + ) + word_shape_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["word_shape_file"] + ) + word_pronunciation_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["word_pronunciation_file"] + ) + else: + raise ValueError( + f"Can't find a directory at path '{save_directory}'. To load the vocabulary from a Google " + "pretrained model use `tokenizer = RocBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + + with open(word_shape_file, "w", encoding="utf8") as writer: + json.dump(self.word_shape, writer, ensure_ascii=False, indent=4, separators=(', ', ': ')) + + with open(word_pronunciation_file, "w", encoding="utf8") as writer: + json.dump(self.word_pronunciation, writer, ensure_ascii=False, indent=4, separators=(', ', ': ')) + + return (vocab_file, word_shape_file, word_pronunciation_file,) + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer with BasicTokenizer->RocBertBasicTokenizer +class RocBertBasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. 
+ + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)  #
+            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
+            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
+            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
+        ):  #
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer with WordpieceTokenizer->RocBertWordpieceTokenizer
+class RocBertWordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+
+        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
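Reviewer note: below is a minimal usage sketch for the tokenizer added above. It is not part of the patch; it simply mirrors the toy fixtures that `tests/models/roc_bert/test_tokenization_roc_bert.py` builds in `setUp`, with made-up placeholder vocabulary contents, to show that encoding a string yields `input_shape_ids` and `input_pronunciation_ids` alongside the usual BERT inputs.

```python
# Sketch only: toy vocab/shape/pronunciation files, mirroring the tokenizer test fixtures.
import json
import os
import tempfile

from transformers import RocBertTokenizer  # assumes this branch is installed

tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "你", "好"]
tmpdir = tempfile.mkdtemp()
vocab_file = os.path.join(tmpdir, "vocab.txt")
word_shape_file = os.path.join(tmpdir, "word_shape.json")
word_pronunciation_file = os.path.join(tmpdir, "word_pronunciation.json")

with open(vocab_file, "w", encoding="utf-8") as f:
    f.write("".join(token + "\n" for token in tokens))
for path in (word_shape_file, word_pronunciation_file):
    with open(path, "w", encoding="utf-8") as f:
        # Placeholder mappings: token -> integer id (real files carry shape / pronunciation codes).
        json.dump({token: i for i, token in enumerate(tokens)}, f, ensure_ascii=False)

tokenizer = RocBertTokenizer(vocab_file, word_shape_file, word_pronunciation_file)
encoding = tokenizer("你好")
print(sorted(encoding.keys()))
# Should list attention_mask, input_ids, input_pronunciation_ids, input_shape_ids, token_type_ids
```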
+"""Tokenization classes for RocBert.""" + +import collections +import json +import os +from typing import List, Optional, Union, Dict, Tuple + +from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_utils_base import TextInput, PreTokenizedInput, EncodedInput, PaddingStrategy, \ + TensorType, TruncationStrategy, BatchEncoding +from ...utils import logging + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.txt", + "word_shape_file": "word_shape.json", + "word_pronunciation_file": "word_pronunciation.json" +} + +# todo: change the path +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/vocab.txt"}, + "word_shape_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/word_shape.json"}, + "word_pronunciation_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/word_shape.json"}, +} + +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +class RocBertTokenizer(PreTrainedTokenizer): + r""" + Construct a RocBERT tokenizer. Based on WordPiece. + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + Args: + vocab_file (`str`): + File containing the vocabulary. + word_shape_file (`str`): + File containing the word => shape info. + word_pronunciation_file (`str`): + File containing the word => shape info. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. 
+ This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + model_input_names: List[str] = ["input_ids", "input_shape_ids", "input_pronunciation_ids", + "token_type_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + word_shape_file, + word_pronunciation_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + for cur_file in [vocab_file, word_shape_file, word_pronunciation_file]: + if not os.path.isfile(cur_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google " + "pretrained model use `tokenizer = RocBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + + self.vocab = load_vocab(vocab_file) + + with open(word_shape_file, "r", encoding="utf8") as in_file: + self.word_shape = json.load(in_file) + + with open(word_pronunciation_file, "r", encoding="utf8") as in_file: + self.word_pronunciation = json.load(in_file) + + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def tokenize(self, text) -> List[str]: + """ + Converts a string in a sequence of tokens, using the tokenizer. + Split in words for word-based vocabulary + Args: + text (`str`): + The sequence to be encoded. + **kwargs (additional keyword arguments): + Passed along to the model-specific `prepare_for_tokenization` preprocessing method. + Returns: + `List[str]`: The list of tokens. 
+ """ + output_tokens = list() + no_split_token = set(self.unique_no_split_tokens) + tokens = self.tokens_trie.split(text) + for i, token in enumerate(tokens): + if token in no_split_token: + output_tokens.append(token) + else: + for word in token: + output_tokens.append(word) + return output_tokens + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + first_tokens = self.tokenize(text) + first_ids = [self._convert_token_to_id(word) for word in first_tokens] + first_ids_shape = [self._convert_token_to_shape_id(word) for word in first_tokens] + first_ids_proun = [self._convert_token_to_pronunciation_id(word) for word in first_tokens] + + second_tokens = self.tokenize(text_pair) if text_pair is not None else None + second_ids = [self._convert_token_to_id(word) for word in second_tokens] if text_pair is not None else None + second_ids_shape = [self._convert_token_to_shape_id(word) for word in + second_tokens] if text_pair is not None else None + second_ids_proun = [self._convert_token_to_pronunciation_id(word) for word in + second_tokens] if text_pair is not None else None + + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + pair = bool(second_ids is not None) + len_ids = len(first_ids) + len_pair_ids = len(second_ids) if pair else 0 + + if return_token_type_ids and not add_special_tokens: + raise ValueError( + "Asking to return token_type_ids while setting add_special_tokens to False " + "results in an undefined behavior. Please set add_special_tokens to True or " + "set return_token_type_ids to None." + ) + + if ( + return_overflowing_tokens + and truncation_strategy == TruncationStrategy.LONGEST_FIRST + and second_ids is not None + ): + raise ValueError( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." 
+ ) + + # Load from model defaults + + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = dict() + + # Compute the total size of the returned encodings + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + + # Truncation: Handle max sequence length + overflowing_tokens = [] + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: + first_ids, second_ids, overflowing_tokens = self.truncate_sequences( + first_ids, + pair_ids=second_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + first_ids_shape, second_ids_shape, _ = self.truncate_sequences( + first_ids_shape, + pair_ids=second_ids_shape, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + first_ids_proun, second_ids_proun, _ = self.truncate_sequences( + first_ids_proun, + pair_ids=second_ids_proun, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Add special tokens + if add_special_tokens: + input_ids = self.build_inputs_with_special_tokens(first_ids, second_ids) + input_shape_ids = self.build_inputs_with_special_tokens(first_ids_shape, second_ids_shape, + self.word_shape["[UNK]"], self.word_shape["[UNK]"]) + input_pronunciation_ids = self.build_inputs_with_special_tokens(first_ids_proun, second_ids_proun, + self.word_pronunciation["[UNK]"], + self.word_pronunciation["[UNK]"]) + token_type_ids = self.create_token_type_ids_from_sequences(first_ids, second_ids) + else: + input_ids = first_ids + second_ids if second_ids else first_ids + input_shape_ids = first_ids_shape + second_ids_shape if second_ids_shape else first_ids_shape + input_pronunciation_ids = first_ids_proun + second_ids_proun if second_ids_proun else first_ids_proun + token_type_ids = [0] * len(first_ids) + ([0] * len(second_ids) if pair else []) + + # Build output dictionary + encoded_inputs["input_ids"] = input_ids + encoded_inputs["input_shape_ids"] = input_shape_ids + encoded_inputs["input_pronunciation_ids"] = input_pronunciation_ids + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(first_ids, second_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(first_ids) + + # Check lengths + self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) + + if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: + encoded_inputs = self.pad( + encoded_inputs, + max_length=max_length, + padding=padding_strategy.value, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + if return_length: + encoded_inputs["length"] = len(encoded_inputs["input_ids"]) + + batch_outputs = BatchEncoding( + encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=True + ) + return batch_outputs + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: 
Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if return_attention_mask and "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * len(required_input) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + for key in ["input_shape_ids", "input_pronunciation_ids"]: + if key in encoded_inputs: + encoded_inputs[key] = encoded_inputs[key] + [self.pad_token_id] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + for key in ["input_shape_ids", "input_pronunciation_ids"]: + if key in encoded_inputs: + encoded_inputs[key] = [self.pad_token_id] * difference + encoded_inputs[key] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + + return encoded_inputs + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_token_to_shape_id(self, token): + """Converts a token (str) in an shape_id using the shape vocab.""" + return self.word_shape.get(token, self.word_shape.get(self.unk_token)) + + def _convert_token_to_pronunciation_id(self, token): + """Converts a token (str) in an shape_id using the shape vocab.""" + return self.word_pronunciation.get(token, self.word_pronunciation.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + 
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, + cls_token_id: int = None, sep_token_id: int = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + cls = [self.cls_token_id] if cls_token_id is None else [cls_token_id] + sep = [self.sep_token_id] if sep_token_id is None else [sep_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
diff --git a/tests/models/roc_bert/__init__.py b/tests/models/roc_bert/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
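Similarly, a small sketch (again not part of the patch) of the forward pass exercised by the model tests that follow: a randomly initialised `RocBertModel` built from a tiny `RocBertConfig` and fed the three parallel id tensors. The config values are the same toy sizes used by `RocBertModelTester`.

```python
# Sketch only: tiny randomly initialised model, mirroring RocBertModelTester.create_and_check_model.
import torch

from transformers import RocBertConfig, RocBertModel  # assumes this branch is installed

config = RocBertConfig(
    vocab_size=99,
    shape_vocab_size=99,
    pronunciation_vocab_size=99,
    shape_embed_dim=32,
    pronunciation_embed_dim=32,
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=37,
)
model = RocBertModel(config).eval()

batch_size, seq_length = 2, 7
input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_length))
input_shape_ids = torch.randint(0, config.shape_vocab_size, (batch_size, seq_length))
input_pronunciation_ids = torch.randint(0, config.pronunciation_vocab_size, (batch_size, seq_length))

with torch.no_grad():
    outputs = model(
        input_ids,
        input_shape_ids=input_shape_ids,
        input_pronunciation_ids=input_pronunciation_ids,
    )
print(outputs.last_hidden_state.shape)  # expected: torch.Size([2, 7, 32])
```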
diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py
new file mode 100644
index 0000000000000..ac0ee0e13d928
--- /dev/null
+++ b/tests/models/roc_bert/test_modeling_roc_bert.py
@@ -0,0 +1,541 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch RocBert model. """ + +import unittest + +from transformers import RocBertConfig +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_modeling_common import floats_tensor + +if is_torch_available(): + import torch + + from transformers import ( + RocBertForCausalLM, + RocBertForMaskedLM, + RocBertForMultipleChoice, + RocBertForQuestionAnswering, + RocBertForSequenceClassification, + RocBertForTokenClassification, + RocBertModel, + ) + from transformers.models.roc_bert.modeling_roc_bert import ( + ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + ) + + +class RocBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + pronunciation_vocab_size=99, + shape_vocab_size=99, + pronunciation_embed_dim=32, + shape_embed_dim=32, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.pronunciation_vocab_size = pronunciation_vocab_size + self.shape_vocab_size = shape_vocab_size + self.pronunciation_embed_dim = pronunciation_embed_dim + self.shape_embed_dim = shape_embed_dim + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_shape_ids = ids_tensor([self.batch_size, self.seq_length], self.shape_vocab_size) + input_pronunciation_ids = ids_tensor([self.batch_size, self.seq_length], self.pronunciation_vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + 
token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return RocBertConfig( + vocab_size=self.vocab_size, + shape_vocab_size=self.shape_vocab_size, + pronunciation_vocab_size=self.pronunciation_vocab_size, + shape_embed_dim=self.shape_embed_dim, + pronunciation_embed_dim=self.pronunciation_embed_dim, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels + ): + model = RocBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, + token_type_ids=token_type_ids) + result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = RocBertModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + 
encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = RocBertForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels + ): + model = RocBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = RocBertForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_shape_tokens = ids_tensor((self.batch_size, 3), config.shape_vocab_size) + next_pronunciation_tokens = ids_tensor((self.batch_size, 3), config.pronunciation_vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_input_shape_ids = torch.cat([input_shape_ids, next_shape_tokens], dim=-1) + next_input_pronunciation_ids = torch.cat([input_pronunciation_ids, next_pronunciation_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + input_shape_ids=next_input_shape_ids, + input_pronunciation_ids=next_input_pronunciation_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + input_shape_ids=next_shape_tokens, + input_pronunciation_ids=next_pronunciation_tokens, + attention_mask=next_attention_mask, + 
encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_question_answering( + self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels + ): + model = RocBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = RocBertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = RocBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = RocBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_inputs_shape_ids = input_shape_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_inputs_pronunciation_ids = input_pronunciation_ids.unsqueeze(1).expand(-1, self.num_choices, + -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + 
input_shape_ids=multiple_choice_inputs_shape_ids, + input_pronunciation_ids=multiple_choice_inputs_pronunciation_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "input_shape_ids": input_shape_ids, + "input_pronunciation_ids": input_pronunciation_ids, "token_type_ids": token_type_ids, + "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class RocBertModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + RocBertModel, + RocBertForMaskedLM, + RocBertForCausalLM, + RocBertForMultipleChoice, + RocBertForQuestionAnswering, + RocBertForSequenceClassification, + RocBertForTokenClassification, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (RocBertForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = RocBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=RocBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + 
token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + @slow + def test_model_from_pretrained(self): + for model_name in ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = RocBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class RocBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = RocBertForMaskedLM.from_pretrained("roc-bert-base-cased") + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + # TODO Replace vocab size + vocab_size = 32000 + + expected_shape = torch.Size((1, 6, vocab_size)) + self.assertEqual(output.shape, expected_shape) + + # TODO Replace values below with what was printed above. + expected_slice = torch.tensor( + [[[-0.0483, 0.1188, -0.0313], [-0.0606, 0.1435, 0.0199], [-0.0235, 0.1519, 0.0175]]] + ) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/models/roc_bert/test_tokenization_roc_bert.py b/tests/models/roc_bert/test_tokenization_roc_bert.py new file mode 100644 index 0000000000000..3ed260ccd5575 --- /dev/null +++ b/tests/models/roc_bert/test_tokenization_roc_bert.py @@ -0,0 +1,332 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import os +import unittest + +from transformers.models.roc_bert.tokenization_roc_bert import ( + VOCAB_FILES_NAMES, + RocBertBasicTokenizer, + RocBertTokenizer, + RocBertWordpieceTokenizer, + _is_control, + _is_punctuation, + _is_whitespace, +) +from transformers.testing_utils import require_tokenizers, slow +from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english + + +@require_tokenizers +class RocBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = RocBertTokenizer + rust_tokenizer_class = None + test_rust_tokenizer = False + space_between_special_tokens = True + from_pretrained_filter = filter_non_english + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "你", + "好", + "是", + "谁", + "a", + "b", + "c", + "d", + ] + word_shape = dict() + word_pronunciation = dict() + for i, value in enumerate(vocab_tokens): + word_shape[value] = i + word_pronunciation[value] = i + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.word_shape_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["word_shape_file"]) + self.word_pronunciation_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["word_pronunciation_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + with open(self.word_shape_file, "w", encoding="utf-8") as word_shape_writer: + json.dump(word_shape, word_shape_writer, ensure_ascii=False) + with open(self.word_pronunciation_file, "w", encoding="utf-8") as word_pronunciation_writer: + json.dump(word_pronunciation, word_pronunciation_writer, ensure_ascii=False) + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file, self.word_shape_file, self.word_pronunciation_file) + + tokens = tokenizer.tokenize("你好[SEP]你是谁") + self.assertListEqual(tokens, ["你", "好", "[SEP]", "你", "是", "谁"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [5, 6, 2, 5, 7, 8]) + self.assertListEqual(tokenizer.convert_tokens_to_shape_ids(tokens), [5, 6, 2, 5, 7, 8]) + self.assertListEqual(tokenizer.convert_tokens_to_pronunciation_ids(tokens), [5, 6, 2, 5, 7, 8]) + + # Copied from tests.models.bert.test_tokenization_bert.test_chinese with BasicTokenizer->RocBertBasicTokenizer + def test_chinese(self): + tokenizer = RocBertBasicTokenizer() + + self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower with BasicTokenizer->RocBertBasicTokenizer + def test_basic_tokenizer_lower(self): + tokenizer = RocBertBasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower_strip_accents_false with BasicTokenizer->RocBertBasicTokenizer + def test_basic_tokenizer_lower_strip_accents_false(self): + tokenizer = RocBertBasicTokenizer(do_lower_case=True, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? 
"), ["hällo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower_strip_accents_true with BertBasicTokenizer->RocBertBertBasicTokenizer + def test_basic_tokenizer_lower_strip_accents_true(self): + tokenizer = RocBertBasicTokenizer(do_lower_case=True, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower_strip_accents_default with BasicTokenizer->RocBertBertBasicTokenizer + def test_basic_tokenizer_lower_strip_accents_default(self): + tokenizer = RocBertBasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_no_lower with BasicTokenizer->RocBertBertBasicTokenizer + def test_basic_tokenizer_no_lower(self): + tokenizer = RocBertBasicTokenizer(do_lower_case=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) + + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_no_lower_strip_accents_false with BertBasicTokenizer->RocBertBertBasicTokenizer + def test_basic_tokenizer_no_lower_strip_accents_false(self): + tokenizer = RocBertBasicTokenizer(do_lower_case=False, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"] + ) + + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_no_lower_strip_accents_true with BasicTokenizer->RocBertBertBasicTokenizer + def test_basic_tokenizer_no_lower_strip_accents_true(self): + tokenizer = RocBertBasicTokenizer(do_lower_case=False, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"] + ) + + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_respects_never_split_tokens with BasicTokenizer->RocBertBertBasicTokenizer + def test_basic_tokenizer_respects_never_split_tokens(self): + tokenizer = RocBertBasicTokenizer(do_lower_case=False, never_split=["[UNK]"]) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] + ) + + # Copied from tests.models.bert.test_tokenization_bert.test_wordpiece_tokenizer with WordpieceTokenizer->RocBertWordpieceTokenizer + def test_wordpiece_tokenizer(self): + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] + + vocab = {} + for i, token in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = RocBertWordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + # Copied from tests.models.bert.test_tokenization_bert.test_is_whitespace + def test_is_whitespace(self): + self.assertTrue(_is_whitespace(" ")) + self.assertTrue(_is_whitespace("\t")) + self.assertTrue(_is_whitespace("\r")) + self.assertTrue(_is_whitespace("\n")) + self.assertTrue(_is_whitespace("\u00A0")) + + self.assertFalse(_is_whitespace("A")) + self.assertFalse(_is_whitespace("-")) + + # Copied from tests.models.bert.test_tokenization_bert.test_is_control + def test_is_control(self): + self.assertTrue(_is_control("\u0005")) + + self.assertFalse(_is_control("A")) + self.assertFalse(_is_control(" ")) + self.assertFalse(_is_control("\t")) + self.assertFalse(_is_control("\r")) + + # Copied from tests.models.bert.test_tokenization_bert.test_is_punctuation + def test_is_punctuation(self): + self.assertTrue(_is_punctuation("-")) + self.assertTrue(_is_punctuation("$")) + self.assertTrue(_is_punctuation("`")) + self.assertTrue(_is_punctuation(".")) + + self.assertFalse(_is_punctuation("A")) + self.assertFalse(_is_punctuation(" ")) + + def test_clean_text(self): + tokenizer = self.get_tokenizer() + + # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340 + self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]) + + if self.test_rust_tokenizer: + rust_tokenizer = self.get_rust_tokenizer() + self.assertListEqual( + [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]] + ) + + # Copied from tests.models.bert.test_tokenization_bert. test_offsets_with_special_characters + def test_offsets_with_special_characters(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
+ tokens = tokenizer_r.encode_plus( + sentence, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) + + do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False + expected_results = ( + [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "A"), + ((1, 2), ","), + ((3, 5), "na"), + ((5, 6), "##ï"), + ((6, 8), "##ve"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "Allen"), + ((21, 23), "##NL"), + ((23, 24), "##P"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + if not do_lower_case + else [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "a"), + ((1, 2), ","), + ((3, 8), "naive"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "allen"), + ((21, 23), "##nl"), + ((23, 24), "##p"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + ) + + self.assertEqual( + [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) + ) + self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) + + # Copied from tests.models.bert.test_tokenization_bert. test_change_tokenize_chinese_chars + def test_change_tokenize_chinese_chars(self): + list_of_commun_chinese_char = ["的", "人", "有"] + text_with_chinese_char = "".join(list_of_commun_chinese_char) + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + kwargs["tokenize_chinese_chars"] = True + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) + ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) + + tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r) + tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p) + + # it is expected that each Chinese character is not preceded by "##" + self.assertListEqual(tokens_without_spe_char_p, list_of_commun_chinese_char) + self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char) + + kwargs["tokenize_chinese_chars"] = False + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) + ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) + + tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r) + tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p) + + # it is expected that only the first Chinese character is not preceded by "##". 
+ expected_tokens = [ + f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_commun_chinese_char) + ] + self.assertListEqual(tokens_without_spe_char_p, expected_tokens) + self.assertListEqual(tokens_without_spe_char_r, expected_tokens) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class(self.vocab_file, self.word_shape_file, self.word_pronunciation_file) + + text = tokenizer.encode("你好", add_special_tokens=False) + text_2 = tokenizer.encode("你是谁", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [101] + text + [102] + assert encoded_pair == [101] + text + [102] + text_2 + [102] + + def test_prepare_for_model(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + string_sequence = "你好,你是谁" + tokens = tokenizer.tokenize(string_sequence) + tokens_ids = tokenizer.convert_tokens_to_ids(tokens) + tokens_shape_ids = tokenizer.convert_tokens_to_shape_ids(tokens) + tokens_proun_ids = tokenizer.convert_tokens_to_pronunciation_ids(tokens) + prepared_input_dict = tokenizer.prepare_for_model(tokens_ids, tokens_shape_ids, tokens_proun_ids, + add_special_tokens=True) + + input_dict = tokenizer.encode_plus(string_sequence, add_special_tokens=True) + + self.assertEqual(input_dict, prepared_input_dict) From 3a6462294286b8b2212c35014614f23af343e5dc Mon Sep 17 00:00:00 2001 From: weiweishi Date: Tue, 1 Nov 2022 10:39:16 +0800 Subject: [PATCH 02/16] update roc_bert readme --- docs/source/en/model_doc/roc_bert.mdx | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/source/en/model_doc/roc_bert.mdx b/docs/source/en/model_doc/roc_bert.mdx index de5c9ae104e15..999888a56bffb 100644 --- a/docs/source/en/model_doc/roc_bert.mdx +++ b/docs/source/en/model_doc/roc_bert.mdx @@ -14,21 +14,27 @@ specific language governing permissions and limitations under the License. ## Overview -The RocBert model was proposed in []() by . +The RocBert model was proposed in [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou. +It's a pretrained Chinese language model that is robust under various forms of adversarial attacks. The abstract from the paper is the following: -** +*Large-scale pretrained language models have achieved SOTA results on NLP tasks. However, they have been shown +vulnerable to adversarial attacks especially for logographic languages like Chinese. In this work, we propose +ROCBERT: a pretrained Chinese Bert that is robust to various forms of adversarial attacks like word perturbation, +synonyms, typos, etc. It is pretrained with the contrastive learning objective which maximizes the label consistency +under different synthesized adversarial examples. The model takes as input multimodal information including the +semantic, phonetic and visual features. We show all these features are important to the model robustness since the +attack can be performed in all the three forms. Across 5 Chinese NLU tasks, ROCBERT outperforms strong baselines under +three blackbox adversarial algorithms without sacrificing the performance on clean testset. 
It also performs the best +in the toxic content detection task under human-made attacks.* -Tips: - - - -This model was contributed by [INSERT YOUR HF USERNAME HERE](). The original code can be found [here](). +This model was contributed by [weiweishi](https://huggingface.co/weiweishi). ## RocBertConfig [[autodoc]] RocBertConfig + - all ## RocBertTokenizer @@ -40,11 +46,6 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE]( Date: Tue, 1 Nov 2022 10:53:56 +0800 Subject: [PATCH 03/16] code style --- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 6 +- src/transformers/models/auto/modeling_auto.py | 16 +- src/transformers/models/roc_bert/__init__.py | 7 +- .../models/roc_bert/configuration_roc_bert.py | 90 ++- .../models/roc_bert/modeling_roc_bert.py | 533 +++++++++--------- .../models/roc_bert/tokenization_roc_bert.py | 364 ++++++------ .../models/roc_bert/test_modeling_roc_bert.py | 303 ++++++---- .../roc_bert/test_tokenization_roc_bert.py | 6 +- 9 files changed, 725 insertions(+), 602 deletions(-) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index d1428107078b6..e9aa160c66d9b 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -17,7 +17,6 @@ # limitations under the License. from . import ( - roc_bert, albert, auto, bart, @@ -127,6 +126,7 @@ resnet, retribert, roberta, + roc_bert, roformer, segformer, sew, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index fcedd4806ea23..f6b038b796880 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -29,7 +29,6 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here - ("roc_bert", "RocBertConfig"), ("albert", "AlbertConfig"), ("bart", "BartConfig"), ("beit", "BeitConfig"), @@ -123,6 +122,7 @@ ("resnet", "ResNetConfig"), ("retribert", "RetriBertConfig"), ("roberta", "RobertaConfig"), + ("roc_bert", "RocBertConfig"), ("roformer", "RoFormerConfig"), ("segformer", "SegformerConfig"), ("sew", "SEWConfig"), @@ -171,7 +171,6 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( [ # Add archive maps here) - ("roc_bert", "ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -257,6 +256,7 @@ ("resnet", "RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("retribert", "RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("roc_bert", "ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roformer", "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("segformer", "SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("sew", "SEW_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -298,7 +298,6 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here - ("roc_bert", "RocBert"), ("albert", "ALBERT"), ("bart", "BART"), ("barthez", "BARThez"), @@ -409,6 +408,7 @@ ("resnet", "ResNet"), ("retribert", "RetriBERT"), ("roberta", "RoBERTa"), + ("roc_bert", "RocBert"), ("roformer", "RoFormer"), ("segformer", "SegFormer"), ("sew", "SEW"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 3d6f339b1abb1..e30db246d9bac 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -28,7 +28,6 @@ MODEL_MAPPING_NAMES = OrderedDict( [ # Base model 
mapping - ("roc_bert", "RocBertModel"), ("albert", "AlbertModel"), ("bart", "BartModel"), ("beit", "BeitModel"), @@ -121,6 +120,7 @@ ("resnet", "ResNetModel"), ("retribert", "RetriBertModel"), ("roberta", "RobertaModel"), + ("roc_bert", "RocBertModel"), ("roformer", "RoFormerModel"), ("segformer", "SegformerModel"), ("sew", "SEWModel"), @@ -219,7 +219,6 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( [ # Model with LM heads mapping -("roc_bert", "RocBertForMaskedLM"), ("albert", "AlbertForMaskedLM"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForMaskedLM"), @@ -270,6 +269,7 @@ ("reformer", "ReformerModelWithLMHead"), ("rembert", "RemBertForMaskedLM"), ("roberta", "RobertaForMaskedLM"), + ("roc_bert", "RocBertForMaskedLM"), ("roformer", "RoFormerForMaskedLM"), ("speech_to_text", "Speech2TextForConditionalGeneration"), ("squeezebert", "SqueezeBertForMaskedLM"), @@ -289,7 +289,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( [ # Model for Causal LM mapping - ("roc_bert", "RocBertForCausalLM"), ("bart", "BartForCausalLM"), ("bert", "BertLMHeadModel"), ("bert-generation", "BertGenerationDecoder"), @@ -322,6 +321,7 @@ ("reformer", "ReformerModelWithLMHead"), ("rembert", "RemBertForCausalLM"), ("roberta", "RobertaForCausalLM"), + ("roc_bert", "RocBertForCausalLM"), ("roformer", "RoFormerForCausalLM"), ("speech_to_text_2", "Speech2Text2ForCausalLM"), ("transfo-xl", "TransfoXLLMHeadModel"), @@ -424,7 +424,6 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( [ # Model for Masked LM mapping -("roc_bert", "RocBertForMaskedLM"), ("albert", "AlbertForMaskedLM"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForMaskedLM"), @@ -456,6 +455,7 @@ ("reformer", "ReformerForMaskedLM"), ("rembert", "RemBertForMaskedLM"), ("roberta", "RobertaForMaskedLM"), + ("roc_bert", "RocBertForMaskedLM"), ("roformer", "RoFormerForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("tapas", "TapasForMaskedLM"), @@ -529,7 +529,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Sequence Classification mapping - ("roc_bert", "RocBertForSequenceClassification"), ("albert", "AlbertForSequenceClassification"), ("bart", "BartForSequenceClassification"), ("bert", "BertForSequenceClassification"), @@ -577,6 +576,7 @@ ("reformer", "ReformerForSequenceClassification"), ("rembert", "RemBertForSequenceClassification"), ("roberta", "RobertaForSequenceClassification"), + ("roc_bert", "RocBertForSequenceClassification"), ("roformer", "RoFormerForSequenceClassification"), ("squeezebert", "SqueezeBertForSequenceClassification"), ("tapas", "TapasForSequenceClassification"), @@ -592,7 +592,6 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( [ # Model for Question Answering mapping - ("roc_bert", "RocBertForQuestionAnswering"), ("albert", "AlbertForQuestionAnswering"), ("bart", "BartForQuestionAnswering"), ("bert", "BertForQuestionAnswering"), @@ -633,6 +632,7 @@ ("reformer", "ReformerForQuestionAnswering"), ("rembert", "RemBertForQuestionAnswering"), ("roberta", "RobertaForQuestionAnswering"), + ("roc_bert", "RocBertForQuestionAnswering"), ("roformer", "RoFormerForQuestionAnswering"), ("splinter", "SplinterForQuestionAnswering"), ("squeezebert", "SqueezeBertForQuestionAnswering"), @@ -668,7 +668,6 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Token Classification mapping -("roc_bert", "RocBertForTokenClassification"), ("albert", "AlbertForTokenClassification"), ("bert", "BertForTokenClassification"), ("big_bird", 
"BigBirdForTokenClassification"), @@ -703,6 +702,7 @@ ("qdqbert", "QDQBertForTokenClassification"), ("rembert", "RemBertForTokenClassification"), ("roberta", "RobertaForTokenClassification"), + ("roc_bert", "RocBertForTokenClassification"), ("roformer", "RoFormerForTokenClassification"), ("squeezebert", "SqueezeBertForTokenClassification"), ("xlm", "XLMForTokenClassification"), @@ -716,7 +716,6 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict( [ # Model for Multiple Choice mapping -("roc_bert", "RocBertForMultipleChoice"), ("albert", "AlbertForMultipleChoice"), ("bert", "BertForMultipleChoice"), ("big_bird", "BigBirdForMultipleChoice"), @@ -742,6 +741,7 @@ ("qdqbert", "QDQBertForMultipleChoice"), ("rembert", "RemBertForMultipleChoice"), ("roberta", "RobertaForMultipleChoice"), + ("roc_bert", "RocBertForMultipleChoice"), ("roformer", "RoFormerForMultipleChoice"), ("squeezebert", "SqueezeBertForMultipleChoice"), ("xlm", "XLMForMultipleChoice"), diff --git a/src/transformers/models/roc_bert/__init__.py b/src/transformers/models/roc_bert/__init__.py index 507100b875155..30377450e81a0 100644 --- a/src/transformers/models/roc_bert/__init__.py +++ b/src/transformers/models/roc_bert/__init__.py @@ -18,8 +18,8 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from ...utils import _LazyModule, OptionalDependencyNotAvailable, is_tokenizers_available -from ...utils import is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available + _import_structure = { "configuration_roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RocBertConfig"], @@ -74,8 +74,8 @@ else: from .modeling_roc_bert import ( ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, - RocBertForMaskedLM, RocBertForCausalLM, + RocBertForMaskedLM, RocBertForMultipleChoice, RocBertForQuestionAnswering, RocBertForSequenceClassification, @@ -87,7 +87,6 @@ ) - else: import sys diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py index 4c88137cc1e78..4a6a90a7d6e3a 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -12,11 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" RocBert model configuration """ +""" RocBert model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging + logger = logging.get_logger(__name__) ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { @@ -27,21 +28,19 @@ class RocBertConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`~RocBertModel`]. - It is used to instantiate an RocBert model according to the specified arguments, defining the model - architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of - the RocBert [roc-bert-base-cased](https://huggingface.co/roc-bert-base-cased) architecture. + This is the configuration class to store the configuration of a [`~RocBertModel`]. It is used to instantiate an + RocBert model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the RocBert + [roc-bert-base-cased](https://huggingface.co/roc-bert-base-cased) architecture. 
- Configuration objects inherit from [`PretrainedConfig`] and can be used - to control the model outputs. Read the documentation from [`PretrainedConfig`] - for more information. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Args: vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the RocBert model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`~RocBertModel`] or - [`~TFRocBertModel`]. + `inputs_ids` passed when calling [`~RocBertModel`] or [`~TFRocBertModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimension of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -51,18 +50,17 @@ class RocBertConfig(PretrainedConfig): intermediate_size (`int`, *optional*, defaults to 3072): Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. - If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. hidden_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout ratio for the attention probabilities. max_position_embeddings (`int`, *optional*, defaults to 512): - The maximum sequence length that this model might ever be used with. - Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`~RocBertModel`] or - [`~TFRocBertModel`]. + The vocabulary size of the `token_type_ids` passed when calling [`~RocBertModel`] or [`~TFRocBertModel`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
layer_norm_eps (`float`, *optional*, defaults to 1e-12): @@ -83,37 +81,36 @@ class RocBertConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config - ``` -""" + ```""" model_type = "roc_bert" def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - use_cache=True, - pad_token_id=0, - position_embedding_type="absolute", - classifier_dropout=None, - enable_cls=True, - enable_pronunciation=True, - enable_shape=True, - pronunciation_embed_dim=768, - pronunciation_vocab_size=910, - shape_embed_dim=512, - shape_vocab_size=24858, - concat_input=True, - **kwargs + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + use_cache=True, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + enable_cls=True, + enable_pronunciation=True, + enable_shape=True, + pronunciation_embed_dim=768, + pronunciation_vocab_size=910, + shape_embed_dim=512, + shape_vocab_size=24858, + concat_input=True, + **kwargs ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -138,7 +135,4 @@ def __init__( self.concat_input = concat_input self.position_embedding_type = position_embedding_type self.classifier_dropout = classifier_dropout - super().__init__( - pad_token_id=pad_token_id, - **kwargs - ) + super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 825509a9315bd..c4afee7ac0c06 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -12,22 +12,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch RocBert model. 
""" +""" PyTorch RocBert model.""" import math import os -from typing import Optional, Tuple, Union, List +from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from .configuration_roc_bert import RocBertConfig from ...activations import ACT2FN from ...modeling_outputs import ( - BaseModelOutputWithPoolingAndCrossAttentions, BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, CausalLMOutputWithCrossAttentions, MaskedLMOutput, MultipleChoiceModelOutput, @@ -36,18 +35,16 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + logging, replace_return_docstrings, ) -from ...utils import logging +from .configuration_roc_bert import RocBertConfig + logger = logging.get_logger(__name__) @@ -91,8 +88,8 @@ def load_tf_weights_in_roc_bert(model, config, tf_checkpoint_path): # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( - n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] - for n in name + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name ): logger.info(f"Skipping {'/'.join(name)}") continue @@ -125,7 +122,7 @@ def load_tf_weights_in_roc_bert(model, config, tf_checkpoint_path): array = np.transpose(array) try: assert ( - pointer.shape == array.shape + pointer.shape == array.shape ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" except AssertionError as e: e.args += (pointer.shape, array.shape) @@ -141,10 +138,12 @@ class RocBertEmbeddings(nn.Module): def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) - self.pronunciation_embed = nn.Embedding(config.pronunciation_vocab_size, config.pronunciation_embed_dim, - padding_idx=config.pad_token_id) - self.shape_embed = nn.Embedding(config.shape_vocab_size, config.shape_embed_dim, - padding_idx=config.pad_token_id) + self.pronunciation_embed = nn.Embedding( + config.pronunciation_vocab_size, config.pronunciation_embed_dim, padding_idx=config.pad_token_id + ) + self.shape_embed = nn.Embedding( + config.shape_vocab_size, config.shape_embed_dim, padding_idx=config.pad_token_id + ) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) @@ -178,8 +177,14 @@ def __init__(self, config): ) def forward( - self, input_ids=None, input_shape_ids=None, input_pronunciation_ids=None, token_type_ids=None, - position_ids=None, inputs_embeds=None, past_key_values_length=0 + self, + input_ids=None, + input_shape_ids=None, + input_pronunciation_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, ): if input_ids is not None: input_shape = input_ids.size() @@ -189,7 +194,7 @@ def forward( seq_length = input_shape[1] if position_ids is None: - 
position_ids = self.position_ids[:, past_key_values_length: seq_length + past_key_values_length] + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves @@ -275,7 +280,9 @@ def __init__(self, config, position_embedding_type=None): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr(config, "position_embedding_type", "absolute") + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) @@ -288,14 +295,14 @@ def transpose_for_scores(self, x): return x.permute(0, 2, 1, 3) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, ): mixed_query_layer = self.query(hidden_states) @@ -424,14 +431,14 @@ def prune_heads(self, heads): self.pruned_heads = self.pruned_heads.union(heads) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, ): self_outputs = self.self( hidden_states, @@ -494,14 +501,14 @@ def __init__(self, config): self.output = RocBertOutput(config) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, ): # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None @@ -523,9 +530,10 @@ def forward( cross_attn_present_key_value = None if self.is_decoder and encoder_hidden_states is not None: - assert hasattr( - self, "crossattention" - ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + assert hasattr(self, "crossattention"), ( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by" + " setting `config.add_cross_attention=True`" + ) # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None @@ -571,17 +579,17 @@ def __init__(self, config): self.gradient_checkpointing 
= False def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -728,8 +736,8 @@ def forward(self, sequence_output): class RocBertPreTrainedModel(PreTrainedModel): """ - An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. """ config_class = RocBertConfig @@ -739,7 +747,7 @@ class RocBertPreTrainedModel(PreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): - """ Initialize the weights """ + """Initialize the weights""" if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 @@ -760,14 +768,14 @@ def _set_gradient_checkpointing(self, module, value=False): ROC_BERT_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general - usage and behavior. + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. Parameters: config ([`~RocBertConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ ROC_BERT_INPUTS_DOCSTRING = r""" @@ -775,24 +783,21 @@ def _set_gradient_checkpointing(self, module, value=False): input_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the vocabulary. - Indices can be obtained using [`RocBertTokenizer`]. - See [`PreTrainedTokenizer.encode`] and + Indices can be obtained using [`RocBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) input_shape_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the shape vocabulary. - Indices can be obtained using [`RocBertTokenizer`]. - See [`PreTrainedTokenizer.encode`] and + Indices can be obtained using [`RocBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) input_pronunciation_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the pronunciation vocabulary. 
- Indices can be obtained using [`RocBertTokenizer`]. - See [`PreTrainedTokenizer.encode`] and + Indices can be obtained using [`RocBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) @@ -804,15 +809,16 @@ def _set_gradient_checkpointing(self, module, value=False): [What are attention masks?](../glossary#attention-mask) token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. [What are token type IDs?](../glossary#token-type-ids) position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range `[0, config.max_position_embeddings - 1]`. + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. [What are position IDs?](../glossary#position-ids) head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): @@ -822,9 +828,9 @@ def _set_gradient_checkpointing(self, module, value=False): - 0 indicates the head is **masked**. inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert *input_ids* indices into associated vectors - than the model's internal embedding lookup matrix. + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -843,17 +849,14 @@ def _set_gradient_checkpointing(self, module, value=False): class RocBertModel(RocBertPreTrainedModel): """ - The model can behave as an encoder (with only self-attention) as well - as a decoder, in which case a layer of cross-attention is added between - the self-attention layers, following the architecture described in [Attention is - all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, - Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the - `is_decoder` argument of the configuration set to `True`. - To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` - argument and `add_cross_attention` set to `True`; an - `encoder_hidden_states` is then expected as an input to the forward pass. + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. 
+ + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->RocBert @@ -892,8 +895,7 @@ def set_shape_embeddings(self, value): # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads def _prune_heads(self, heads_to_prune): """Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - See base class PreTrainedModel + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @@ -906,42 +908,41 @@ def _prune_heads(self, heads_to_prune): config_class=_CONFIG_FOR_DOC, ) def forward( - self, - input_ids: Optional[torch.Tensor] = None, - input_shape_ids: Optional[torch.Tensor] = None, - input_pronunciation_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - if the model is configured as a decoder. + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask - is used in the cross-attention if the model is configured as a decoder. - Mask values selected in `[0, 1]`: + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. 
Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` - (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` - instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up - decoding (see `past_key_values`). + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1039,7 +1040,7 @@ def forward( ) -@add_start_docstrings("""RocBert Model with a `language modeling` head on top. """, ROC_BERT_START_DOCSTRING) +@add_start_docstrings("""RocBert Model with a `language modeling` head on top.""", ROC_BERT_START_DOCSTRING) class RocBertForMaskedLM(RocBertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] @@ -1076,28 +1077,27 @@ def set_output_embeddings(self, new_embeddings): config_class=_CONFIG_FOR_DOC, ) def forward( - self, - input_ids: Optional[torch.Tensor] = None, - input_shape_ids: Optional[torch.Tensor] = None, - input_pronunciation_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. 
- Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) - Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels - in `[0, ..., config.vocab_size]`. + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1136,8 +1136,9 @@ def forward( attentions=outputs.attentions, ) - def prepare_inputs_for_generation(self, input_ids, input_shape_ids=None, input_pronunciation_ids=None, - attention_mask=None, **model_kwargs): + def prepare_inputs_for_generation( + self, input_ids, input_shape_ids=None, input_pronunciation_ids=None, attention_mask=None, **model_kwargs + ): input_shape = input_ids.shape effective_batch_size = input_shape[0] @@ -1153,12 +1154,16 @@ def prepare_inputs_for_generation(self, input_ids, input_shape_ids=None, input_p if input_pronunciation_ids is not None: input_pronunciation_ids = torch.cat([input_pronunciation_ids, dummy_token], dim=1) - return {"input_ids": input_ids, "input_shape_ids": input_shape_ids, - "input_pronunciation_ids": input_pronunciation_ids, "attention_mask": attention_mask} + return { + "input_ids": input_ids, + "input_shape_ids": input_shape_ids, + "input_pronunciation_ids": input_pronunciation_ids, + "attention_mask": attention_mask, + } @add_start_docstrings( - """RocBert Model with a `language modeling` head on top for CLM fine-tuning. """, ROC_BERT_START_DOCSTRING + """RocBert Model with a `language modeling` head on top for CLM fine-tuning.""", ROC_BERT_START_DOCSTRING ) class RocBertForCausalLM(RocBertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] @@ -1188,23 +1193,23 @@ def set_output_embeddings(self, new_embeddings): @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( - self, - input_ids: Optional[torch.Tensor] = None, - input_shape_ids: Optional[torch.Tensor] = None, - input_pronunciation_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.Tensor]] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: 
Optional[List[torch.Tensor]] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -1217,26 +1222,24 @@ def forward( - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 - tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional - tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two - additional tensors are only required when the model is used as a decoder in a Sequence to Sequence - model. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the - cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential - decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` - (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` - instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional tensors are + only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up - decoding (see `past_key_values`). + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
Returns: @@ -1246,17 +1249,16 @@ def forward( >>> from transformers import RocBertTokenizer, RocBertForCausalLM, RocBertConfig >>> import torch - >>> tokenizer = RocBertTokenizer.from_pretrained('roc-bert-base-cased') + >>> tokenizer = RocBertTokenizer.from_pretrained("roc-bert-base-cased") >>> config = RocBertConfig.from_pretrained("roc-bert-base-cased") >>> config.is_decoder = True - >>> model = RocBertForCausalLM.from_pretrained('roc-bert-base-cased', config=config) + >>> model = RocBertForCausalLM.from_pretrained("roc-bert-base-cased", config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) >>> prediction_logits = outputs.logits - ``` -""" + ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roc_bert( @@ -1301,8 +1303,15 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, input_shape_ids=None, input_pronunciation_ids=None, past=None, - attention_mask=None, **model_kwargs): + def prepare_inputs_for_generation( + self, + input_ids, + input_shape_ids=None, + input_pronunciation_ids=None, + past=None, + attention_mask=None, + **model_kwargs + ): input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly @@ -1317,22 +1326,27 @@ def prepare_inputs_for_generation(self, input_ids, input_shape_ids=None, input_p if input_pronunciation_ids is not None: input_pronunciation_ids = input_pronunciation_ids[:, -1:] - return {"input_ids": input_ids, "input_shape_ids": input_shape_ids, - "input_pronunciation_ids": input_pronunciation_ids, "attention_mask": attention_mask, - "past_key_values": past} + return { + "input_ids": input_ids, + "input_shape_ids": input_shape_ids, + "input_pronunciation_ids": input_pronunciation_ids, + "attention_mask": attention_mask, + "past_key_values": past, + } # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel._reorder_cache def _reorder_cache(self, past, beam_idx): reordered_past = () for layer_past in past: reordered_past += ( - tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],) + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) return reordered_past @add_start_docstrings( """RocBert Model transformer with a sequence classification/regression head on top (a linear layer on top of - the pooled output) e.g. for GLUE tasks. """, + the pooled output) e.g. 
for GLUE tasks.""", ROC_BERT_START_DOCSTRING, ) class RocBertForSequenceClassification(RocBertPreTrainedModel): @@ -1359,26 +1373,25 @@ def __init__(self, config): config_class=_CONFIG_FOR_DOC, ) def forward( - self, - input_ids: Optional[torch.Tensor] = None, - input_shape_ids: Optional[torch.Tensor] = None, - input_pronunciation_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. - Indices should be in `[0, ..., config.num_labels - 1]`. - If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1437,7 +1450,7 @@ def forward( @add_start_docstrings( """RocBert Model with a multiple choice classification head on top (a linear layer on top of - the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + the pooled output and a softmax) e.g. 
for RocStories/SWAG tasks.""", ROC_BERT_START_DOCSTRING, ) class RocBertForMultipleChoice(RocBertPreTrainedModel): @@ -1455,7 +1468,9 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_start_docstrings_to_model_forward( + ROC_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1463,33 +1478,36 @@ def __init__(self, config): config_class=_CONFIG_FOR_DOC, ) def forward( - self, - input_ids: Optional[torch.Tensor] = None, - input_shape_ids: Optional[torch.Tensor] = None, - input_pronunciation_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the multiple choice classification loss. - Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension - of the input tensors. (See `input_ids` above) + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None input_shape_ids = input_shape_ids.view(-1, input_shape_ids.size(-1)) if input_shape_ids is not None else None - input_pronunciation_ids = input_pronunciation_ids.view(-1, input_pronunciation_ids.size( - -1)) if input_pronunciation_ids is not None else None + input_pronunciation_ids = ( + input_pronunciation_ids.view(-1, input_pronunciation_ids.size(-1)) + if input_pronunciation_ids is not None + else None + ) attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None @@ -1538,7 +1556,7 @@ def forward( @add_start_docstrings( """RocBert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks. """, + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""", ROC_BERT_START_DOCSTRING, ) class RocBertForTokenClassification(RocBertPreTrainedModel): @@ -1567,24 +1585,23 @@ def __init__(self, config): config_class=_CONFIG_FOR_DOC, ) def forward( - self, - input_ids: Optional[torch.Tensor] = None, - input_shape_ids: Optional[torch.Tensor] = None, - input_pronunciation_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ): r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the token classification loss. - Indices should be in `[0, ..., config.num_labels - 1]`. + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1626,7 +1643,7 @@ def forward( @add_start_docstrings( """RocBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", + layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""", ROC_BERT_START_DOCSTRING, ) class RocBertForQuestionAnswering(RocBertPreTrainedModel): @@ -1651,30 +1668,30 @@ def __init__(self, config): config_class=_CONFIG_FOR_DOC, ) def forward( - self, - input_ids: Optional[torch.Tensor] = None, - input_shape_ids: Optional[torch.Tensor] = None, - input_pronunciation_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - start_positions: Optional[torch.Tensor] = None, - end_positions: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index 633e9fab845e0..4b922b18610d1 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -19,31 +19,32 @@ import json import os import unicodedata -from typing import List, Optional, Union, Dict, Tuple +from typing import Dict, List, Optional, Tuple, Union from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace from ...tokenization_utils_base import ( - TextInput, - PreTokenizedInput, - EncodedInput, - PaddingStrategy, - TensorType, - TruncationStrategy, - BatchEncoding, ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, - TextInputPair, + BatchEncoding, + EncodedInput, EncodedInputPair, + PaddingStrategy, + PreTokenizedInput, PreTokenizedInputPair, + TensorType, + TextInput, + TextInputPair, + TruncationStrategy, ) -from ...utils import logging, add_end_docstrings +from ...utils import add_end_docstrings, logging + logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.txt", "word_shape_file": "word_shape.json", - "word_pronunciation_file": "word_pronunciation.json" + "word_pronunciation_file": "word_pronunciation.json", } # todo: change the path @@ -86,10 +87,10 @@ def whitespace_tokenize(text): class RocBertTokenizer(PreTrainedTokenizer): r""" - Construct a RocBERT tokenizer. Based on WordPiece. - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods. Args: + Construct a RocBERT tokenizer. Based on WordPiece. This tokenizer inherits from [`PreTrainedTokenizer`] which + contains most of the main methods. Users should refer to this superclass for more information regarding those + methods. vocab_file (`str`): File containing the vocabulary. word_shape_file (`str`): @@ -119,8 +120,7 @@ class RocBertTokenizer(PreTrainedTokenizer): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): - Whether or not to tokenize Chinese characters. - This should likely be deactivated for Japanese (see this + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)). strip_accents (`bool`, *optional*): Whether or not to strip all accents. 
If this option is not specified, then it will be determined by the @@ -136,21 +136,21 @@ class RocBertTokenizer(PreTrainedTokenizer): # "token_type_ids", "attention_mask"] def __init__( - self, - vocab_file, - word_shape_file, - word_pronunciation_file, - do_lower_case=True, - do_basic_tokenize=True, - never_split=None, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - cls_token="[CLS]", - mask_token="[MASK]", - tokenize_chinese_chars=True, - strip_accents=None, - **kwargs + self, + vocab_file, + word_shape_file, + word_pronunciation_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs ): super().__init__( do_lower_case=do_lower_case, @@ -223,25 +223,25 @@ def _tokenize(self, text): return split_tokens def _encode_plus( - self, - text: Union[TextInput, PreTokenizedInput, EncodedInput], - text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, - add_special_tokens: bool = True, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs ) -> BatchEncoding: def get_input_ids(text): if isinstance(text, str): @@ -318,57 +318,57 @@ def get_input_ids(text): @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) def prepare_for_model( - self, - ids: List[int], - shape_ids: List[int], - pronunciation_ids: List[int], - pair_ids: Optional[List[int]] = None, - pair_shape_ids: Optional[List[int]] = None, - pair_pronunciation_ids: Optional[List[int]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - 
return_length: bool = False, - verbose: bool = True, - prepend_batch_axis: bool = False, - **kwargs + self, + ids: List[int], + shape_ids: List[int], + pronunciation_ids: List[int], + pair_ids: Optional[List[int]] = None, + pair_shape_ids: Optional[List[int]] = None, + pair_pronunciation_ids: Optional[List[int]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + prepend_batch_axis: bool = False, + **kwargs ) -> BatchEncoding: """ - Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It - adds special tokens, truncates sequences if overflowing while taking into account the special tokens and - manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids* - different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return - overflowing tokens. Such a combination of arguments will raise an error. - - Args: - ids (`List[int]`): - Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and - `convert_tokens_to_id` methods. - shape_ids (`List[int]`): - Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and - `convert_token_to_shape_id` methods. - pronunciation_ids (`List[int]`): - Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and - `convert_token_to_pronunciation_id` methods. - pair_ids (`List[int]`, *optional*): - Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` - and `convert_tokens_to_id` methods. - pair_shape_ids (`List[int]`, *optional*): - Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` - and `convert_token_to_shape_id` methods. - pair_pronunciation_ids (`List[int]`, *optional*): - Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` - and `convert_token_to_pronunciation_id` methods. - """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids* + different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return + overflowing tokens. Such a combination of arguments will raise an error. + + Args: + ids (`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and + `convert_tokens_to_id` methods. + shape_ids (`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and + `convert_token_to_shape_id` methods. + pronunciation_ids (`List[int]`): + Tokenized input ids of the first sequence. 
Can be obtained from a string by chaining the `tokenize` and + `convert_token_to_pronunciation_id` methods. + pair_ids (`List[int]`, *optional*): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` + and `convert_tokens_to_id` methods. + pair_shape_ids (`List[int]`, *optional*): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` + and `convert_token_to_shape_id` methods. + pair_pronunciation_ids (`List[int]`, *optional*): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` + and `convert_token_to_pronunciation_id` methods. + """ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( @@ -392,9 +392,9 @@ def prepare_for_model( ) if ( - return_overflowing_tokens - and truncation_strategy == TruncationStrategy.LONGEST_FIRST - and pair_ids is not None + return_overflowing_tokens + and truncation_strategy == TruncationStrategy.LONGEST_FIRST + and pair_ids is not None ): raise ValueError( "Not possible to return overflowing tokens for pair of sequences with the " @@ -446,16 +446,22 @@ def prepare_for_model( if add_special_tokens: sequence = self.build_inputs_with_special_tokens(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) - input_shape_ids = self.build_inputs_with_special_tokens(shape_ids, pair_shape_ids, - self.word_shape["[UNK]"], self.word_shape["[UNK]"]) - input_pronunciation_ids = self.build_inputs_with_special_tokens(pronunciation_ids, pair_pronunciation_ids, - self.word_pronunciation["[UNK]"], - self.word_pronunciation["[UNK]"]) + input_shape_ids = self.build_inputs_with_special_tokens( + shape_ids, pair_shape_ids, self.word_shape["[UNK]"], self.word_shape["[UNK]"] + ) + input_pronunciation_ids = self.build_inputs_with_special_tokens( + pronunciation_ids, + pair_pronunciation_ids, + self.word_pronunciation["[UNK]"], + self.word_pronunciation["[UNK]"], + ) else: sequence = ids + pair_ids if pair_ids else ids token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair_ids else []) input_shape_ids = shape_ids + pair_shape_ids if pair_shape_ids else shape_ids - input_pronunciation_ids = pronunciation_ids + pair_pronunciation_ids if pair_pronunciation_ids else pronunciation_ids + input_pronunciation_ids = ( + pronunciation_ids + pair_pronunciation_ids if pair_pronunciation_ids else pronunciation_ids + ) # Build output dictionary encoded_inputs["input_ids"] = sequence @@ -492,12 +498,12 @@ def prepare_for_model( return batch_outputs def _pad( - self, - encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], - max_length: Optional[int] = None, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, ) -> dict: # Load from model defaults if return_attention_mask is None: @@ -525,7 +531,7 @@ def _pad( encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = ( - encoded_inputs["token_type_ids"] + 
[self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference ) if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference @@ -552,31 +558,31 @@ def _pad( return encoded_inputs def _batch_encode_plus( - self, - batch_text_or_text_pairs: Union[ - List[TextInput], - List[TextInputPair], - List[PreTokenizedInput], - List[PreTokenizedInputPair], - List[EncodedInput], - List[EncodedInputPair], - ], - add_special_tokens: bool = True, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs ) -> BatchEncoding: def get_input_ids(text): if isinstance(text, str): @@ -657,23 +663,23 @@ def get_input_ids(text): @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) def _batch_prepare_for_model( - self, - batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], - batch_shape_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], - batch_pronunciation_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], - add_special_tokens: bool = True, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[str] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_length: bool = False, - verbose: bool = True, + self, + batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], + batch_shape_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], + batch_pronunciation_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = 
TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_length: bool = False, + verbose: bool = True, ) -> BatchEncoding: """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It @@ -774,8 +780,11 @@ def convert_tokens_to_string(self, tokens): # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, - cls_token_id: int = None, sep_token_id: int = None + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + cls_token_id: int = None, + sep_token_id: int = None, ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and @@ -801,8 +810,7 @@ def build_inputs_with_special_tokens( # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding @@ -831,7 +839,7 @@ def get_special_tokens_mask( # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
A BERT sequence @@ -864,15 +872,15 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = if os.path.isdir(save_directory): vocab_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"] + (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"], ) word_shape_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["word_shape_file"] + (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["word_shape_file"], ) word_pronunciation_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["word_pronunciation_file"] + (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["word_pronunciation_file"], ) else: raise ValueError( @@ -892,12 +900,16 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = index += 1 with open(word_shape_file, "w", encoding="utf8") as writer: - json.dump(self.word_shape, writer, ensure_ascii=False, indent=4, separators=(', ', ': ')) + json.dump(self.word_shape, writer, ensure_ascii=False, indent=4, separators=(", ", ": ")) with open(word_pronunciation_file, "w", encoding="utf8") as writer: - json.dump(self.word_pronunciation, writer, ensure_ascii=False, indent=4, separators=(', ', ': ')) + json.dump(self.word_pronunciation, writer, ensure_ascii=False, indent=4, separators=(", ", ": ")) - return (vocab_file, word_shape_file, word_pronunciation_file,) + return ( + vocab_file, + word_shape_file, + word_pronunciation_file, + ) # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer with BasicTokenizer->RocBertBasicTokenizer @@ -1023,14 +1035,14 @@ def _is_chinese_char(self, cp): # space-separated words, so they are not treated specially and handled # like the all of the other languages. 
if ( - (cp >= 0x4E00 and cp <= 0x9FFF) - or (cp >= 0x3400 and cp <= 0x4DBF) # - or (cp >= 0x20000 and cp <= 0x2A6DF) # - or (cp >= 0x2A700 and cp <= 0x2B73F) # - or (cp >= 0x2B740 and cp <= 0x2B81F) # - or (cp >= 0x2B820 and cp <= 0x2CEAF) # - or (cp >= 0xF900 and cp <= 0xFAFF) - or (cp >= 0x2F800 and cp <= 0x2FA1F) # + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # ): # return True diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index ac0ee0e13d928..3397e6aedc58c 100644 --- a/tests/models/roc_bert/test_modeling_roc_bert.py +++ b/tests/models/roc_bert/test_modeling_roc_bert.py @@ -16,12 +16,12 @@ import unittest -from transformers import RocBertConfig -from transformers import is_torch_available +from transformers import RocBertConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device + from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -from ...test_modeling_common import floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + if is_torch_available(): import torch @@ -35,40 +35,38 @@ RocBertForTokenClassification, RocBertModel, ) - from transformers.models.roc_bert.modeling_roc_bert import ( - ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, - ) + from transformers.models.roc_bert.modeling_roc_bert import ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST class RocBertModelTester: def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - pronunciation_vocab_size=99, - shape_vocab_size=99, - pronunciation_embed_dim=32, - shape_embed_dim=32, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + pronunciation_vocab_size=99, + shape_vocab_size=99, + pronunciation_embed_dim=32, + shape_embed_dim=32, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, ): self.parent = parent self.batch_size = batch_size @@ -120,7 +118,17 @@ def prepare_config_and_inputs(self): config = self.get_config() - return config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) def get_config(self): return RocBertConfig( @@ -174,32 +182,49 @@ def 
prepare_config_and_inputs_for_decoder(self): ) def create_and_check_model( - self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, ): model = RocBertModel(config=config) model.to(torch_device) model.eval() - result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, - token_type_ids=token_type_ids) + result = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + ) + result = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + token_type_ids=token_type_ids, + ) result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def create_and_check_model_as_decoder( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, ): config.add_cross_attention = True model = RocBertModel(config) @@ -222,55 +247,80 @@ def create_and_check_model_as_decoder( token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, ) - result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, token_type_ids=token_type_ids) + result = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + ) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def create_and_check_for_causal_lm( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, ): model = RocBertForCausalLM(config=config) model.to(torch_device) model.eval() - result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + result = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + ) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) def create_and_check_for_masked_lm( - self, config, input_ids, 
input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, ): model = RocBertForMaskedLM(config=config) model.to(torch_device) model.eval() - result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + result = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + ) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, ): config.is_decoder = True config.add_cross_attention = True @@ -333,8 +383,16 @@ def create_and_check_decoder_model_past_large_inputs( self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) def create_and_check_for_question_answering( - self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, ): model = RocBertForQuestionAnswering(config=config) model.to(torch_device) @@ -352,32 +410,68 @@ def create_and_check_for_question_answering( self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) def create_and_check_for_sequence_classification( - self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, ): config.num_labels = self.num_labels model = RocBertForSequenceClassification(config) model.to(torch_device) model.eval() - result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + result = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=sequence_labels, + ) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) def create_and_check_for_token_classification( - self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, ): config.num_labels = self.num_labels model = 
RocBertForTokenClassification(config=config) model.to(torch_device) model.eval() - result = model(input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + result = model( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + ) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) def create_and_check_for_multiple_choice( - self, config, input_ids, input_shape_ids, input_pronunciation_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, ): config.num_choices = self.num_choices model = RocBertForMultipleChoice(config=config) @@ -385,8 +479,9 @@ def create_and_check_for_multiple_choice( model.eval() multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_inputs_shape_ids = input_shape_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_inputs_pronunciation_ids = input_pronunciation_ids.unsqueeze(1).expand(-1, self.num_choices, - -1).contiguous() + multiple_choice_inputs_pronunciation_ids = ( + input_pronunciation_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + ) multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() result = model( @@ -412,9 +507,13 @@ def prepare_config_and_inputs_for_common(self): token_labels, choice_labels, ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "input_shape_ids": input_shape_ids, - "input_pronunciation_ids": input_pronunciation_ids, "token_type_ids": token_type_ids, - "attention_mask": input_mask} + inputs_dict = { + "input_ids": input_ids, + "input_shape_ids": input_shape_ids, + "input_pronunciation_ids": input_pronunciation_ids, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } return config, inputs_dict diff --git a/tests/models/roc_bert/test_tokenization_roc_bert.py b/tests/models/roc_bert/test_tokenization_roc_bert.py index 3ed260ccd5575..14fdbfbecefc5 100644 --- a/tests/models/roc_bert/test_tokenization_roc_bert.py +++ b/tests/models/roc_bert/test_tokenization_roc_bert.py @@ -28,6 +28,7 @@ _is_whitespace, ) from transformers.testing_utils import require_tokenizers, slow + from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english @@ -324,8 +325,9 @@ def test_prepare_for_model(self): tokens_ids = tokenizer.convert_tokens_to_ids(tokens) tokens_shape_ids = tokenizer.convert_tokens_to_shape_ids(tokens) tokens_proun_ids = tokenizer.convert_tokens_to_pronunciation_ids(tokens) - prepared_input_dict = tokenizer.prepare_for_model(tokens_ids, tokens_shape_ids, tokens_proun_ids, - add_special_tokens=True) + prepared_input_dict = tokenizer.prepare_for_model( + tokens_ids, tokens_shape_ids, tokens_proun_ids, add_special_tokens=True + ) input_dict = tokenizer.encode_plus(string_sequence, add_special_tokens=True) From 1af8383bf33349f9d461515e3c504056fa22cf57 Mon Sep 17 00:00:00 2001 From: weiweishi Date: Tue, 1 Nov 2022 10:56:23 +0800 Subject: [PATCH 04/16] change name and delete unuse 
file --- .../models/roc_bert/configuration_roc_bert.py | 2 +- .../models/roc_bert/modeling_roc_bert.py | 2 +- .../models/roc_bert/tokenization_roc_bert.py | 2 +- .../roc_bert/tokenization_roc_bert.py.bak | 529 ------------------ 4 files changed, 3 insertions(+), 532 deletions(-) delete mode 100644 src/transformers/models/roc_bert/tokenization_roc_bert.py.bak diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py index 4a6a90a7d6e3a..f4fcca16532de 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 weiweishi and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 WeChatAI and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index c4afee7ac0c06..c811695dfaf73 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 weiweishi The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 WeChatAI The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index 4b922b18610d1..e37e80d47085c 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 weiweishi and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 WeChatAI and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py.bak b/src/transformers/models/roc_bert/tokenization_roc_bert.py.bak deleted file mode 100644 index 5c43364d0d21d..0000000000000 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py.bak +++ /dev/null @@ -1,529 +0,0 @@ -# coding=utf-8 -# Copyright 2022 weiweishi and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Tokenization classes for RocBert.""" - -import collections -import json -import os -from typing import List, Optional, Union, Dict, Tuple - -from ...tokenization_utils import PreTrainedTokenizer -from ...tokenization_utils_base import TextInput, PreTokenizedInput, EncodedInput, PaddingStrategy, \ - TensorType, TruncationStrategy, BatchEncoding -from ...utils import logging - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = { - "vocab_file": "vocab.txt", - "word_shape_file": "word_shape.json", - "word_pronunciation_file": "word_pronunciation.json" -} - -# todo: change the path -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/vocab.txt"}, - "word_shape_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/word_shape.json"}, - "word_pronunciation_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/word_shape.json"}, -} - -# Copied from transformers.models.bert.tokenization_bert.load_vocab -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - with open(vocab_file, "r", encoding="utf-8") as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip("\n") - vocab[token] = index - return vocab - - -class RocBertTokenizer(PreTrainedTokenizer): - r""" - Construct a RocBERT tokenizer. Based on WordPiece. - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods. - Args: - vocab_file (`str`): - File containing the vocabulary. - word_shape_file (`str`): - File containing the word => shape info. - word_pronunciation_file (`str`): - File containing the word => shape info. - do_lower_case (`bool`, *optional*, defaults to `True`): - Whether or not to lowercase the input when tokenizing. - do_basic_tokenize (`bool`, *optional*, defaults to `True`): - Whether or not to do basic tokenization before WordPiece. - never_split (`Iterable`, *optional*): - Collection of tokens which will never be split during tokenization. Only has an effect when - `do_basic_tokenize=True` - unk_token (`str`, *optional*, defaults to `"[UNK]"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (`str`, *optional*, defaults to `"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - pad_token (`str`, *optional*, defaults to `"[PAD]"`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (`str`, *optional*, defaults to `"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - mask_token (`str`, *optional*, defaults to `"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): - Whether or not to tokenize Chinese characters. 
- This should likely be deactivated for Japanese (see this - [issue](https://github.com/huggingface/transformers/issues/328)). - strip_accents (`bool`, *optional*): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for `lowercase` (as in the original BERT). - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - model_input_names: List[str] = ["input_ids", "input_shape_ids", "input_pronunciation_ids", - "token_type_ids", "attention_mask"] - - def __init__( - self, - vocab_file, - word_shape_file, - word_pronunciation_file, - do_lower_case=True, - do_basic_tokenize=True, - never_split=None, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - cls_token="[CLS]", - mask_token="[MASK]", - tokenize_chinese_chars=True, - strip_accents=None, - **kwargs - ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - - for cur_file in [vocab_file, word_shape_file, word_pronunciation_file]: - if not os.path.isfile(cur_file): - raise ValueError( - f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google " - "pretrained model use `tokenizer = RocBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" - ) - - self.vocab = load_vocab(vocab_file) - - with open(word_shape_file, "r", encoding="utf8") as in_file: - self.word_shape = json.load(in_file) - - with open(word_pronunciation_file, "r", encoding="utf8") as in_file: - self.word_pronunciation = json.load(in_file) - - self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) - - @property - def do_lower_case(self): - return self.basic_tokenizer.do_lower_case - - @property - def vocab_size(self): - return len(self.vocab) - - def get_vocab(self): - return dict(self.vocab, **self.added_tokens_encoder) - - def tokenize(self, text) -> List[str]: - """ - Converts a string in a sequence of tokens, using the tokenizer. - Split in words for word-based vocabulary - Args: - text (`str`): - The sequence to be encoded. - **kwargs (additional keyword arguments): - Passed along to the model-specific `prepare_for_tokenization` preprocessing method. - Returns: - `List[str]`: The list of tokens. 
- """ - output_tokens = list() - no_split_token = set(self.unique_no_split_tokens) - tokens = self.tokens_trie.split(text) - for i, token in enumerate(tokens): - if token in no_split_token: - output_tokens.append(token) - else: - for word in token: - output_tokens.append(word) - return output_tokens - - def _encode_plus( - self, - text: Union[TextInput, PreTokenizedInput, EncodedInput], - text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, - add_special_tokens: bool = True, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs - ) -> BatchEncoding: - first_tokens = self.tokenize(text) - first_ids = [self._convert_token_to_id(word) for word in first_tokens] - first_ids_shape = [self._convert_token_to_shape_id(word) for word in first_tokens] - first_ids_proun = [self._convert_token_to_pronunciation_id(word) for word in first_tokens] - - second_tokens = self.tokenize(text_pair) if text_pair is not None else None - second_ids = [self._convert_token_to_id(word) for word in second_tokens] if text_pair is not None else None - second_ids_shape = [self._convert_token_to_shape_id(word) for word in - second_tokens] if text_pair is not None else None - second_ids_proun = [self._convert_token_to_pronunciation_id(word) for word in - second_tokens] if text_pair is not None else None - - padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( - padding=padding_strategy.value, - truncation=truncation_strategy.value, - max_length=max_length, - pad_to_multiple_of=pad_to_multiple_of, - verbose=verbose, - **kwargs, - ) - pair = bool(second_ids is not None) - len_ids = len(first_ids) - len_pair_ids = len(second_ids) if pair else 0 - - if return_token_type_ids and not add_special_tokens: - raise ValueError( - "Asking to return token_type_ids while setting add_special_tokens to False " - "results in an undefined behavior. Please set add_special_tokens to True or " - "set return_token_type_ids to None." - ) - - if ( - return_overflowing_tokens - and truncation_strategy == TruncationStrategy.LONGEST_FIRST - and second_ids is not None - ): - raise ValueError( - "Not possible to return overflowing tokens for pair of sequences with the " - "`longest_first`. Please select another truncation strategy than `longest_first`, " - "for instance `only_second` or `only_first`." 
- ) - - # Load from model defaults - - if return_token_type_ids is None: - return_token_type_ids = "token_type_ids" in self.model_input_names - if return_attention_mask is None: - return_attention_mask = "attention_mask" in self.model_input_names - - encoded_inputs = dict() - - # Compute the total size of the returned encodings - total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) - - # Truncation: Handle max sequence length - overflowing_tokens = [] - if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: - first_ids, second_ids, overflowing_tokens = self.truncate_sequences( - first_ids, - pair_ids=second_ids, - num_tokens_to_remove=total_len - max_length, - truncation_strategy=truncation_strategy, - stride=stride, - ) - first_ids_shape, second_ids_shape, _ = self.truncate_sequences( - first_ids_shape, - pair_ids=second_ids_shape, - num_tokens_to_remove=total_len - max_length, - truncation_strategy=truncation_strategy, - stride=stride, - ) - first_ids_proun, second_ids_proun, _ = self.truncate_sequences( - first_ids_proun, - pair_ids=second_ids_proun, - num_tokens_to_remove=total_len - max_length, - truncation_strategy=truncation_strategy, - stride=stride, - ) - - if return_overflowing_tokens: - encoded_inputs["overflowing_tokens"] = overflowing_tokens - encoded_inputs["num_truncated_tokens"] = total_len - max_length - - # Add special tokens - if add_special_tokens: - input_ids = self.build_inputs_with_special_tokens(first_ids, second_ids) - input_shape_ids = self.build_inputs_with_special_tokens(first_ids_shape, second_ids_shape, - self.word_shape["[UNK]"], self.word_shape["[UNK]"]) - input_pronunciation_ids = self.build_inputs_with_special_tokens(first_ids_proun, second_ids_proun, - self.word_pronunciation["[UNK]"], - self.word_pronunciation["[UNK]"]) - token_type_ids = self.create_token_type_ids_from_sequences(first_ids, second_ids) - else: - input_ids = first_ids + second_ids if second_ids else first_ids - input_shape_ids = first_ids_shape + second_ids_shape if second_ids_shape else first_ids_shape - input_pronunciation_ids = first_ids_proun + second_ids_proun if second_ids_proun else first_ids_proun - token_type_ids = [0] * len(first_ids) + ([0] * len(second_ids) if pair else []) - - # Build output dictionary - encoded_inputs["input_ids"] = input_ids - encoded_inputs["input_shape_ids"] = input_shape_ids - encoded_inputs["input_pronunciation_ids"] = input_pronunciation_ids - if return_token_type_ids: - encoded_inputs["token_type_ids"] = token_type_ids - if return_special_tokens_mask: - if add_special_tokens: - encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(first_ids, second_ids) - else: - encoded_inputs["special_tokens_mask"] = [0] * len(first_ids) - - # Check lengths - self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) - - if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: - encoded_inputs = self.pad( - encoded_inputs, - max_length=max_length, - padding=padding_strategy.value, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - ) - - if return_length: - encoded_inputs["length"] = len(encoded_inputs["input_ids"]) - - batch_outputs = BatchEncoding( - encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=True - ) - return batch_outputs - - def _pad( - self, - encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], - max_length: 
Optional[int] = None, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - ) -> dict: - # Load from model defaults - if return_attention_mask is None: - return_attention_mask = "attention_mask" in self.model_input_names - - required_input = encoded_inputs[self.model_input_names[0]] - - if padding_strategy == PaddingStrategy.LONGEST: - max_length = len(required_input) - - if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): - max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of - - needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length - - # Initialize attention mask if not present. - if return_attention_mask and "attention_mask" not in encoded_inputs: - encoded_inputs["attention_mask"] = [1] * len(required_input) - - if needs_to_be_padded: - difference = max_length - len(required_input) - - if self.padding_side == "right": - if return_attention_mask: - encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference - if "token_type_ids" in encoded_inputs: - encoded_inputs["token_type_ids"] = ( - encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference - ) - if "special_tokens_mask" in encoded_inputs: - encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference - for key in ["input_shape_ids", "input_pronunciation_ids"]: - if key in encoded_inputs: - encoded_inputs[key] = encoded_inputs[key] + [self.pad_token_id] * difference - encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": - if return_attention_mask: - encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] - if "token_type_ids" in encoded_inputs: - encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ - "token_type_ids" - ] - if "special_tokens_mask" in encoded_inputs: - encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] - for key in ["input_shape_ids", "input_pronunciation_ids"]: - if key in encoded_inputs: - encoded_inputs[key] = [self.pad_token_id] * difference + encoded_inputs[key] - encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input - else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) - - return encoded_inputs - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.vocab.get(token, self.vocab.get(self.unk_token)) - - def _convert_token_to_shape_id(self, token): - """Converts a token (str) in an shape_id using the shape vocab.""" - return self.word_shape.get(token, self.word_shape.get(self.unk_token)) - - def _convert_token_to_pronunciation_id(self, token): - """Converts a token (str) in an shape_id using the shape vocab.""" - return self.word_pronunciation.get(token, self.word_pronunciation.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.ids_to_tokens.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - out_string = " ".join(tokens).replace(" ##", "").strip() - return out_string - - def build_inputs_with_special_tokens( - 
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, - cls_token_id: int = None, sep_token_id: int = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A BERT sequence has the following format: - - single sequence: `[CLS] X [SEP]` - - pair of sequences: `[CLS] A [SEP] B [SEP]` - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - cls = [self.cls_token_id] if cls_token_id is None else [cls_token_id] - sep = [self.sep_token_id] if sep_token_id is None else [sep_token_id] - if token_ids_1 is None: - return cls + token_ids_0 + sep - return cls + token_ids_0 + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence - pair mask has the following format: - ``` - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - ``` - If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
- """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - index = 0 - if os.path.isdir(save_directory): - vocab_file = os.path.join( - save_directory, - (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"] - ) - word_shape_file = os.path.join( - save_directory, - (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["word_shape_file"] - ) - word_pronunciation_file = os.path.join( - save_directory, - (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["word_pronunciation_file"] - ) - else: - raise ValueError( - f"Can't find a directory at path '{save_directory}'. To load the vocabulary from a Google " - "pretrained model use `tokenizer = RocBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" - ) - - with open(vocab_file, "w", encoding="utf-8") as writer: - for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!" - ) - index = token_index - writer.write(token + "\n") - index += 1 - - with open(word_shape_file, "w", encoding="utf8") as writer: - json.dump(self.word_shape, writer, ensure_ascii=False, indent=4, separators=(', ', ': ')) - - with open(word_pronunciation_file, "w", encoding="utf8") as writer: - json.dump(self.word_pronunciation, writer, ensure_ascii=False, indent=4, separators=(', ', ': ')) - - return (vocab_file, word_shape_file, word_pronunciation_file,) From 611c5feb5f43ecb74ca7c7def5142c21865a1b30 Mon Sep 17 00:00:00 2001 From: weiweishi Date: Tue, 1 Nov 2022 11:36:42 +0800 Subject: [PATCH 05/16] udpate model file --- src/transformers/models/roc_bert/modeling_roc_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index c811695dfaf73..c329f1f0e936d 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -53,7 +53,7 @@ _TOKENIZER_FOR_DOC = "RocBertTokenizer" ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "roc-bert-base-cased", + "weiweishi/roc-bert-base-zh", # See all RocBert models at https://huggingface.co/models?filter=roc_bert ] From 37557c9c3694c5322a76aa9643d5c038392e3be1 Mon Sep 17 00:00:00 2001 From: weiweishi Date: Tue, 1 Nov 2022 11:44:28 +0800 Subject: [PATCH 06/16] delete unuse log file --- src/transformers/models/roc_bert/__init__.py | 2 - tests/models/roc_bert/log.txt | 43 -------------------- 2 files changed, 45 deletions(-) delete mode 100644 tests/models/roc_bert/log.txt diff --git a/src/transformers/models/roc_bert/__init__.py b/src/transformers/models/roc_bert/__init__.py index 30377450e81a0..c49f7ab8400b2 100644 --- a/src/transformers/models/roc_bert/__init__.py +++ b/src/transformers/models/roc_bert/__init__.py @@ -31,8 +31,6 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass -else: - _import_structure["tokenization_roc_bert_fast"] = ["RocBertTokenizerFast"] try: if not is_torch_available(): diff --git a/tests/models/roc_bert/log.txt b/tests/models/roc_bert/log.txt deleted file mode 100644 
index b6282c70c735e..0000000000000 --- a/tests/models/roc_bert/log.txt +++ /dev/null @@ -1,43 +0,0 @@ -2022-10-31 17:23:02.406472: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory -2022-10-31 17:23:02.406519: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. -WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.) -============================= test session starts ============================== -platform linux -- Python 3.8.13, pytest-7.1.2, pluggy-1.0.0 -rootdir: /data/git_code/transformers, configfile: setup.cfg -plugins: typeguard-2.13.3, timeout-2.1.0, metadata-2.0.2, hypothesis-6.56.3, anyio-3.6.1, xdist-3.0.2, dash-2.6.2, asyncio-0.19.0, html-3.1.1 -asyncio: mode=strict -collected 0 items - -=============================== warnings summary =============================== -../../../../conda_finder/lib/python3.8/site-packages/flatbuffers/compat.py:19 - /data/git_code/conda_finder/lib/python3.8/site-packages/flatbuffers/compat.py:19: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses - import imp - -../../../../conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:36 - /data/git_code/conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:36: DeprecationWarning: NEAREST is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.NEAREST or Dither.NONE instead. - 'nearest': pil_image.NEAREST, - -../../../../conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:37 - /data/git_code/conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:37: DeprecationWarning: BILINEAR is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BILINEAR instead. - 'bilinear': pil_image.BILINEAR, - -../../../../conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:38 - /data/git_code/conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:38: DeprecationWarning: BICUBIC is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BICUBIC instead. - 'bicubic': pil_image.BICUBIC, - -../../../../conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:39 - /data/git_code/conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:39: DeprecationWarning: HAMMING is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.HAMMING instead. - 'hamming': pil_image.HAMMING, - -../../../../conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:40 - /data/git_code/conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:40: DeprecationWarning: BOX is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BOX instead. - 'box': pil_image.BOX, - -../../../../conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:41 - /data/git_code/conda_finder/lib/python3.8/site-packages/keras/utils/image_utils.py:41: DeprecationWarning: LANCZOS is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.LANCZOS instead. 
- 'lanczos': pil_image.LANCZOS, - --- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html -============================= 7 warnings in 0.01s ============================== -ERROR: file or directory not found: tests/models/roc_bert/test_tokenization_roc_bert.py - From f685aeb8ee584468161b5314a42592a5a2d43324 Mon Sep 17 00:00:00 2001 From: weiweishi Date: Tue, 1 Nov 2022 11:45:34 +0800 Subject: [PATCH 07/16] delete tokenizer fast --- src/transformers/__init__.py | 37 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6b434448430bc..43cc9e8a794b7 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -120,7 +120,6 @@ ], "models": [], # Models - "models.roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RocBertConfig", "RocBertTokenizer"], "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.auto": [ "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -316,6 +315,7 @@ "models.resnet": ["RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ResNetConfig"], "models.retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig", "RetriBertTokenizer"], "models.roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaTokenizer"], + "models.roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RocBertConfig", "RocBertTokenizer"], "models.roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerTokenizer"], "models.segformer": ["SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegformerConfig"], "models.sew": ["SEW_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWConfig"], @@ -562,7 +562,6 @@ ] else: # Fast tokenizers structure - _import_structure["models.roc_bert"].append("RocBertTokenizerFast") _import_structure["models.albert"].append("AlbertTokenizerFast") _import_structure["models.bart"].append("BartTokenizerFast") _import_structure["models.barthez"].append("BarthezTokenizerFast") @@ -3197,7 +3196,6 @@ load_tf2_weights_in_pytorch_model, ) from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig - from .models.roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RocBertConfig, RocBertTokenizer from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, @@ -3376,6 +3374,7 @@ from .models.resnet import RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ResNetConfig from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer + from .models.roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RocBertConfig, RocBertTokenizer from .models.roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerTokenizer from .models.segformer import SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SegformerConfig from .models.sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig @@ -3596,7 +3595,6 @@ from .utils.dummy_tokenizers_objects import * else: # Fast tokenizers imports - from .models.roc_bert import RocBertTokenizerFast from .models.albert import AlbertTokenizerFast from .models.bart import BartTokenizerFast from .models.barthez import BarthezTokenizerFast @@ -3823,22 +3821,6 @@ ) from .generation_utils import top_k_top_p_filtering from .modeling_utils import PreTrainedModel - - # PyTorch model imports - - from .models.roc_bert import ( - ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, - RocBertForMaskedLM, - RocBertForCausalLM, 
- RocBertForMultipleChoice, - RocBertForQuestionAnswering, - RocBertForSequenceClassification, - RocBertForTokenClassification, - RocBertLayer, - RocBertModel, - RocBertPreTrainedModel, - load_tf_weights_in_roc_bert, - ) from .models.albert import ( ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertForMaskedLM, @@ -4654,6 +4636,21 @@ RobertaModel, RobertaPreTrainedModel, ) + + # PyTorch model imports + from .models.roc_bert import ( + ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + RocBertForCausalLM, + RocBertForMaskedLM, + RocBertForMultipleChoice, + RocBertForQuestionAnswering, + RocBertForSequenceClassification, + RocBertForTokenClassification, + RocBertLayer, + RocBertModel, + RocBertPreTrainedModel, + load_tf_weights_in_roc_bert, + ) from .models.roformer import ( ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, RoFormerForCausalLM, From 8676555d6ae5fdd12bb9cf67414955262c0a1113 Mon Sep 17 00:00:00 2001 From: weiweishi Date: Tue, 1 Nov 2022 14:52:35 +0800 Subject: [PATCH 08/16] reformat code and change model file path --- .../models/roc_bert/configuration_roc_bert.py | 24 +++++++++++++++--- .../models/roc_bert/modeling_roc_bert.py | 14 ++++++----- .../models/roc_bert/tokenization_roc_bert.py | 25 +++++++++++-------- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py index f4fcca16532de..6d407cbb56161 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -21,7 +21,7 @@ logger = logging.get_logger(__name__) ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "roc-bert-base-cased": "https://huggingface.co/roc-bert-base-cased/resolve/main/config.json", + "roc-bert-base-cased": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/config.json", # See all RocBert models at https://huggingface.co/models?filter=roc_bert } @@ -31,7 +31,7 @@ class RocBertConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`~RocBertModel`]. It is used to instantiate an RocBert model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the RocBert - [roc-bert-base-cased](https://huggingface.co/roc-bert-base-cased) architecture. + [roc-bert-base-cased](https://huggingface.co/weiweishi/roc-bert-base-zh) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -68,15 +68,31 @@ class RocBertConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. + enable_cls (`bool`, *optional*, defaults to `True`): + Whether or not the model use cls loss when pretrained. + enable_pronunciation (`bool`, *optional*, defaults to `True`): + Whether or not the model use pronunciation embed when training. + enable_shape (`bool`, *optional*, defaults to `True`): + Whether or not the model use shape embed when training. + pronunciation_embed_dim (`int`, *optional*, defaults to 768): + Dimension of the pronunciation_embed. + pronunciation_vocab_size (`int`, *optional*, defaults to 910): + Pronunciation Vocabulary size of the RocBert model. 
Defines the number of different tokens that can be + represented by the `input_pronunciation_ids` passed when calling [`~RocBertModel`]. + shape_embed_dim (`int`, *optional*, defaults to 512): + Dimension of the shape_embed. + shape_vocab_size (`int`, *optional*, defaults to 24858): + Shape Vocabulary size of the RocBert model. Defines the number of different tokens that can be represented + by the `input_shape_ids` passed when calling [`~RocBertModel`]. Example: ```python >>> from transformers import RocBertModel, RocBertConfig - >>> # Initializing a RocBert roc-bert-base-cased style configuration + >>> # Initializing a RocBert weiweishi/roc-bert-base-zh style configuration >>> configuration = RocBertConfig() - >>> # Initializing a model from the roc-bert-base-cased style configuration + >>> # Initializing a model from the weiweishi/roc-bert-base-zh style configuration >>> model = RocBertModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index c329f1f0e936d..19891acd8969c 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -58,6 +58,7 @@ ] +# Copied from transformers.models.bert.modeling_bert.load_tf_weights_in_bert with bert->roc_bert def load_tf_weights_in_roc_bert(model, config, tf_checkpoint_path): """Load tf checkpoints in a pytorch model.""" try: @@ -734,6 +735,7 @@ def forward(self, sequence_output): return prediction_scores +# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->RocBert class RocBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -793,14 +795,14 @@ def _set_gradient_checkpointing(self, module, value=False): Indices can be obtained using [`RocBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. - [What are input IDs?](../glossary#input-ids) + [What are input IDs?](../glossary#input_shape_ids) input_pronunciation_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the pronunciation vocabulary. Indices can be obtained using [`RocBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. - [What are input IDs?](../glossary#input-ids) + [What are input IDs?](../glossary#input_pronunciation_ids) attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: @@ -1249,12 +1251,12 @@ def forward( >>> from transformers import RocBertTokenizer, RocBertForCausalLM, RocBertConfig >>> import torch - >>> tokenizer = RocBertTokenizer.from_pretrained("roc-bert-base-cased") - >>> config = RocBertConfig.from_pretrained("roc-bert-base-cased") + >>> tokenizer = RocBertTokenizer.from_pretrained("weiweishi/roc-bert-base-zh") + >>> config = RocBertConfig.from_pretrained("weiweishi/roc-bert-base-zh") >>> config.is_decoder = True - >>> model = RocBertForCausalLM.from_pretrained("roc-bert-base-cased", config=config) + >>> model = RocBertForCausalLM.from_pretrained("weiweishi/roc-bert-base-zh", config=config) - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> inputs = tokenizer("你好,很高兴认识你", return_tensors="pt") >>> outputs = model(**inputs) >>> prediction_logits = outputs.logits diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index e37e80d47085c..e509748ef6739 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -49,17 +49,25 @@ # todo: change the path PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/vocab.txt"}, - "word_shape_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/word_shape.json"}, - "word_pronunciation_file": {"roc-bert-base-uncased": "/data/git_code/wit/tmp/word_shape.json"}, + "vocab_file": { + "weiweishi/roc-bert-base-zh": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/vocab.txt" + }, + "word_shape_file": { + "weiweishi/roc-bert-base-zh": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/word_shape.json" + }, + "word_pronunciation_file": { + "weiweishi/roc-bert-base-zh": ( + "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/word_pronunciation.json" + ) + }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "roc-bert-base-uncased": 512, + "weiweishi/roc-bert-base-zh": 512, } PRETRAINED_INIT_CONFIGURATION = { - "roc-bert-base-uncased": {"do_lower_case": True}, + "weiweishi/roc-bert-base-zh": {"do_lower_case": True}, } @@ -96,7 +104,7 @@ class RocBertTokenizer(PreTrainedTokenizer): word_shape_file (`str`): File containing the word => shape info. word_pronunciation_file (`str`): - File containing the word => shape info. + File containing the word => pronunciation info. do_lower_case (`bool`, *optional*, defaults to `True`): Whether or not to lowercase the input when tokenizing. do_basic_tokenize (`bool`, *optional*, defaults to `True`): @@ -132,9 +140,6 @@ class RocBertTokenizer(PreTrainedTokenizer): pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - # model_input_names: List[str] = ["input_ids", "input_shape_ids", "input_pronunciation_ids", - # "token_type_ids", "attention_mask"] - def __init__( self, vocab_file, @@ -606,7 +611,7 @@ def get_input_ids(text): tokens_proun_ids = self.convert_tokens_to_pronunciation_ids(text) return tokens_ids, tokens_shape_ids, tokens_proun_ids elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): - return text + return text, [0] * len(text), [0] * len(text) # shape and proun id is pad_value else: raise ValueError( "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." 
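Note on the tokenizer changes above: with the `weiweishi/roc-bert-base-zh` files (vocab.txt, word_shape.json, word_pronunciation.json) now wired into `PRETRAINED_VOCAB_FILES_MAP`, a single tokenizer call returns three parallel id sequences. The snippet below is a minimal usage sketch, not part of the patches; it assumes a transformers checkout with this series applied and access to that checkpoint.

```python
# Minimal sketch of the RocBertTokenizer added in this series (assumes the
# weiweishi/roc-bert-base-zh checkpoint referenced in the patch is reachable).
from transformers import RocBertTokenizer

tokenizer = RocBertTokenizer.from_pretrained("weiweishi/roc-bert-base-zh")

# One call produces semantic, shape, and pronunciation id sequences of equal
# length; RocBert's embedding layer fuses the three streams.
encoded = tokenizer("你好,很高兴认识你", return_tensors="pt")

print(sorted(encoded.keys()))
# expected, given model_input_names in this patch:
# ['attention_mask', 'input_ids', 'input_pronunciation_ids', 'input_shape_ids', 'token_type_ids']
print(encoded["input_ids"])                # ids from vocab.txt
print(encoded["input_shape_ids"])          # ids from word_shape.json
print(encoded["input_pronunciation_ids"])  # ids from word_pronunciation.json
```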
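The next patch in the series (09/16) adds `RocBertForPreTraining`, whose forward pass combines the masked-LM loss with a contrastive term over the pooled outputs of the clean, label, and adversarial ("attack") batches. The self-contained sketch below mirrors that computation, with random tensors standing in for the encoder outputs and variable names matching the patch, to make the objective easier to follow.

```python
# Sketch of the contrastive term used by RocBertForPreTraining (see patch 09).
# Random tensors stand in for the three pooled encoder outputs.
import torch
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

batch_size, hidden_dim = 4, 768
pooled_output = torch.randn(batch_size, hidden_dim)         # clean input batch
labels_pooled_output = torch.randn(batch_size, hidden_dim)   # target (label) batch
attack_pooled_output = torch.randn(batch_size, hidden_dim)   # adversarial batch

loss_fct = CrossEntropyLoss()
pooled_norm = F.normalize(pooled_output, dim=-1)
labels_norm = F.normalize(labels_pooled_output, dim=-1)
attack_norm = F.normalize(attack_pooled_output, dim=-1)

# Cosine-similarity matrices: row i should score highest against column i,
# i.e. each sample should be closest to its own attack counterpart.
sim_matrix = torch.matmul(pooled_norm, attack_norm.T)
sim_matrix_target = torch.matmul(labels_norm, attack_norm.T)
batch_labels = torch.arange(batch_size)

# The factor 100 sharpens the softmax, mirroring the scaling in the forward pass;
# the two cross-entropy terms are averaged before being added to the MLM loss.
contrastive_loss = (
    loss_fct(100 * sim_matrix, batch_labels) + loss_fct(100 * sim_matrix_target, batch_labels)
) / 2
print(contrastive_loss)
```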
From 04f0dae50ddbea45bfb8e735e026b5e559f8b260 Mon Sep 17 00:00:00 2001 From: weiweishi Date: Wed, 2 Nov 2022 11:28:00 +0800 Subject: [PATCH 09/16] add RocBertForPreTraining --- src/transformers/__init__.py | 2 + src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/roc_bert/__init__.py | 2 + .../models/roc_bert/modeling_roc_bert.py | 188 +++++++++++++++++- .../models/roc_bert/test_modeling_roc_bert.py | 39 ++++ 5 files changed, 230 insertions(+), 2 deletions(-) mode change 100755 => 100644 src/transformers/__init__.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py old mode 100755 new mode 100644 index 43cc9e8a794b7..25b3572d5f304 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -853,6 +853,7 @@ "RocBertForTokenClassification", "RocBertLayer", "RocBertModel", + "RocBertForPreTraining", "RocBertPreTrainedModel", "load_tf_weights_in_roc_bert", ] @@ -4643,6 +4644,7 @@ RocBertForCausalLM, RocBertForMaskedLM, RocBertForMultipleChoice, + RocBertForPreTraining, RocBertForQuestionAnswering, RocBertForSequenceClassification, RocBertForTokenClassification, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index e30db246d9bac..6765015b583a8 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -197,6 +197,7 @@ ("openai-gpt", "OpenAIGPTLMHeadModel"), ("retribert", "RetriBertModel"), ("roberta", "RobertaForMaskedLM"), + ("roc_bert", "RocBertForPreTraining"), ("splinter", "SplinterForPreTraining"), ("squeezebert", "SqueezeBertForMaskedLM"), ("t5", "T5ForConditionalGeneration"), diff --git a/src/transformers/models/roc_bert/__init__.py b/src/transformers/models/roc_bert/__init__.py index c49f7ab8400b2..edf00f06d6e52 100644 --- a/src/transformers/models/roc_bert/__init__.py +++ b/src/transformers/models/roc_bert/__init__.py @@ -49,6 +49,7 @@ "RocBertLayer", "RocBertModel", "RocBertPreTrainedModel", + "RocBertPreTraining", "load_tf_weights_in_roc_bert", ] @@ -75,6 +76,7 @@ RocBertForCausalLM, RocBertForMaskedLM, RocBertForMultipleChoice, + RocBertForPreTraining, RocBertForQuestionAnswering, RocBertForSequenceClassification, RocBertForTokenClassification, diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 19891acd8969c..b21364b9dac0a 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -1042,6 +1042,190 @@ def forward( ) +@add_start_docstrings( + """ + RocBert Model with contrastive loss and masked_lm_loss during the pretraining. 
+ """, + ROC_BERT_START_DOCSTRING, +) +class RocBertForPreTraining(RocBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.roc_bert = RocBertModel(config) + self.cls = RocBertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.get_output_embeddings + def get_output_embeddings(self): + return self.cls.predictions.decoder + + # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + input_shape_ids: Optional[torch.Tensor] = None, + input_pronunciation_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + attack_input_ids: Optional[torch.Tensor] = None, + attack_input_shape_ids: Optional[torch.Tensor] = None, + attack_input_pronunciation_ids: Optional[torch.Tensor] = None, + attack_attention_mask: Optional[torch.Tensor] = None, + attack_token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels_input_ids: Optional[torch.Tensor] = None, + labels_input_shape_ids: Optional[torch.Tensor] = None, + labels_input_pronunciation_ids: Optional[torch.Tensor] = None, + labels_attention_mask: Optional[torch.Tensor] = None, + labels_token_type_ids: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + r""" + attack_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + attack sample ids for computing the contrastive loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + attack_input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + attack sample shape ids for computing the contrastive loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + attack_input_pronunciation_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + attack sample pronunciation ids for computing the contrastive loss. Indices should be in `[-100, 0, + ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + labels_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + target ids for computing the contrastive loss and masked_lm_loss . 
Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + labels_input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + target shape ids for computing the contrastive loss and masked_lm_loss . Indices should be in `[-100, + 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + labels_input_pronunciation_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + target pronunciation ids for computing the contrastive loss and masked_lm_loss . Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., + config.vocab_size]` + + kwargs (`Dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. + + Returns: + + Example: + + ```python + >>> from transformers import RocBertTokenizer, RocBertForPreTraining + >>> import torch + + >>> tokenizer = RocBertTokenizer.from_pretrained("weiweishi/roc-bert-base-zh") + >>> model = RocBertForPreTraining.from_pretrained("weiweishi/roc-bert-base-zh") + + >>> inputs = tokenizer("你好,很高兴认识你", return_tensors="pt") + >>> attack_inputs = tokenizer("你号,很高兴认识你", return_tensors="pt") + >>> attack_keys = list(attack_inputs.keys()) + >>> for key in attack_keys: + ... attack_inputs[f"attack_{key}"] = attack_inputs.pop(key) + >>> label_inputs = tokenizer("你好,很高兴认识你", return_tensors="pt") + >>> label_keys = list(attack_inputs.keys()) + >>> for key in label_keys: + ... 
label_inputs[f"labels_{key}"] = label_inputs.pop(key) + + >>> inputs.update(label_inputs) + >>> inputs.update(attack_inputs) + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roc_bert( + input_ids, + input_shape_ids=input_shape_ids, + input_pronunciation_ids=input_pronunciation_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores = self.cls(sequence_output) + + loss = None + if labels_input_ids is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels_input_ids.view(-1)) + + if attack_input_ids is not None: + batch_size, _ = labels_input_ids.shape + device = labels_input_ids.device + + target_inputs = torch.clone(labels_input_ids) + target_inputs[target_inputs == -100] = 0 + + labels_output = self.roc_bert( + labels_input_ids, + input_shape_ids=labels_input_shape_ids, + input_pronunciation_ids=labels_input_pronunciation_ids, + attention_mask=labels_attention_mask, + token_type_ids=labels_token_type_ids, + return_dict=return_dict, + ) + attack_output = self.roc_bert( + attack_input_ids, + input_shape_ids=attack_input_shape_ids, + input_pronunciation_ids=attack_input_pronunciation_ids, + attention_mask=attack_attention_mask, + token_type_ids=attack_token_type_ids, + return_dict=return_dict, + ) + + labels_pooled_output = labels_output[1] + attack_pooled_output = attack_output[1] + + pooled_output_norm = torch.nn.functional.normalize(pooled_output, dim=-1) + labels_pooled_output_norm = torch.nn.functional.normalize(labels_pooled_output, dim=-1) + attack_pooled_output_norm = torch.nn.functional.normalize(attack_pooled_output, dim=-1) + + sim_matrix = torch.matmul(pooled_output_norm, attack_pooled_output_norm.T) # batch_size * hidden_dim + sim_matrix_target = torch.matmul(labels_pooled_output_norm, attack_pooled_output_norm.T) + batch_labels = torch.tensor([i for i in range(batch_size)], device=device) + contrastive_loss = ( + loss_fct(100 * sim_matrix.view(batch_size, -1), batch_labels.view(-1)) + + loss_fct(100 * sim_matrix_target.view(batch_size, -1), batch_labels.view(-1)) + ) / 2 + + loss = contrastive_loss + masked_lm_loss + else: + loss = masked_lm_loss + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @add_start_docstrings("""RocBert Model with a `language modeling` head on top.""", ROC_BERT_START_DOCSTRING) class RocBertForMaskedLM(RocBertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] @@ -1564,7 +1748,7 @@ def forward( class RocBertForTokenClassification(RocBertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->Ernie,bert->ernie + # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->RocBert,bert->roc_bert def __init__(self, config): super().__init__(config) 
self.num_labels = config.num_labels @@ -1651,7 +1835,7 @@ def forward( class RocBertForQuestionAnswering(RocBertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->Ernie,bert->ernie + # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->RocBert,bert->roc_bert def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index 3397e6aedc58c..dcee768a274c6 100644 --- a/tests/models/roc_bert/test_modeling_roc_bert.py +++ b/tests/models/roc_bert/test_modeling_roc_bert.py @@ -30,6 +30,7 @@ RocBertForCausalLM, RocBertForMaskedLM, RocBertForMultipleChoice, + RocBertForPreTraining, RocBertForQuestionAnswering, RocBertForSequenceClassification, RocBertForTokenClassification, @@ -516,6 +517,40 @@ def prepare_config_and_inputs_for_common(self): } return config, inputs_dict + def create_and_check_for_pretraining( + self, + config, + input_ids, + input_shape_ids, + input_pronunciation_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = RocBertForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + input_shape_ids, + input_pronunciation_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + attack_input_ids=input_ids, + attack_input_shape_ids=input_shape_ids, + attack_input_pronunciation_ids=input_pronunciation_ids, + attack_attention_mask=input_mask, + attack_token_type_ids=token_type_ids, + labels_input_ids=token_labels, + labels_input_shape_ids=input_shape_ids, + labels_input_pronunciation_ids=input_pronunciation_ids, + labels_attention_mask=input_mask, + labels_token_type_ids=token_type_ids, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + @require_torch class RocBertModelTest(ModelTesterMixin, unittest.TestCase): @@ -575,6 +610,10 @@ def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) From e2f0d65043b7ac967d07a13620dfd895ca59d210 Mon Sep 17 00:00:00 2001 From: weiweishi Date: Wed, 2 Nov 2022 14:28:37 +0800 Subject: [PATCH 10/16] update docs --- docs/source/en/model_doc/roc_bert.mdx | 6 ++++++ src/transformers/models/roc_bert/__init__.py | 2 +- src/transformers/models/roc_bert/modeling_roc_bert.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/roc_bert.mdx b/docs/source/en/model_doc/roc_bert.mdx index 999888a56bffb..a6e69e9313ced 100644 --- a/docs/source/en/model_doc/roc_bert.mdx +++ b/docs/source/en/model_doc/roc_bert.mdx @@ -52,6 +52,12 @@ This model was contributed by [weiweishi](https://huggingface.co/weiweishi). 
- forward +## RocBertForPreTraining + +[[autodoc]] RocBertForPreTraining + - forward + + ## RocBertForCausalLM [[autodoc]] RocBertForCausalLM diff --git a/src/transformers/models/roc_bert/__init__.py b/src/transformers/models/roc_bert/__init__.py index edf00f06d6e52..c1afa2f07edd5 100644 --- a/src/transformers/models/roc_bert/__init__.py +++ b/src/transformers/models/roc_bert/__init__.py @@ -49,7 +49,7 @@ "RocBertLayer", "RocBertModel", "RocBertPreTrainedModel", - "RocBertPreTraining", + "RocBertForPreTraining", "load_tf_weights_in_roc_bert", ] diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index b21364b9dac0a..46b2b9c54cb52 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -48,7 +48,7 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "roc-bert-base-cased" +_CHECKPOINT_FOR_DOC = "weiweishi/roc-bert-base-zh" _CONFIG_FOR_DOC = "RocBertConfig" _TOKENIZER_FOR_DOC = "RocBertTokenizer" From d4b4dd7075852243208f8626f3c5858409f33d68 Mon Sep 17 00:00:00 2001 From: weiweishi Date: Wed, 2 Nov 2022 14:58:24 +0800 Subject: [PATCH 11/16] delete wrong notes --- src/transformers/models/roc_bert/modeling_roc_bert.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 46b2b9c54cb52..f50127508913e 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -1145,8 +1145,7 @@ def forward( >>> inputs.update(attack_inputs) >>> outputs = model(**inputs) - >>> prediction_logits = outputs.prediction_logits - >>> seq_relationship_logits = outputs.seq_relationship_logits + >>> logits = outputs.logits ``` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict From b93728a9574712cca35ec2d235a306b556973982 Mon Sep 17 00:00:00 2001 From: weiweishi Date: Wed, 2 Nov 2022 15:54:35 +0800 Subject: [PATCH 12/16] fix copies --- README.md | 1 + README_es.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 2 + .../models/roc_bert/modeling_roc_bert.py | 127 +++++++++--------- .../models/roc_bert/tokenization_roc_bert.py | 10 +- src/transformers/utils/dummy_pt_objects.py | 77 +++++++++++ 9 files changed, 151 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index c291d5a461dd0..4537dce109bfc 100644 --- a/README.md +++ b/README.md @@ -363,6 +363,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. 
**[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RocBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/README_es.md b/README_es.md index 8e6ad7d902a37..95ff3bb4dd889 100644 --- a/README_es.md +++ b/README_es.md @@ -363,6 +363,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RocBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. 
**[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/README_ko.md b/README_ko.md index 4f0c005148a01..8e6012c9edb13 100644 --- a/README_ko.md +++ b/README_ko.md @@ -313,6 +313,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RocBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/README_zh-hans.md b/README_zh-hans.md index 129ab5d7ae874..ad50d7592a095 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -337,6 +337,7 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 1. 
**[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。 +1. **[RocBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 5d7292b03cc02..17238c678195b 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -349,6 +349,7 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RocBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. 
**[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 04f82259d7ce0..57a02dd52ed5a 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -152,6 +152,7 @@ The documentation is organized into five sections: 1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RocBert](model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. @@ -302,6 +303,7 @@ Flax), PyTorch, and/or TensorFlow. 
| ResNet | ❌ | ❌ | ✅ | ✅ | ❌ | | RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | | RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RocBert | ✅ | ❌ | ✅ | ❌ | ❌ | | RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | | SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | | SEW | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index f50127508913e..9e38e7f2352c4 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -122,9 +122,8 @@ def load_tf_weights_in_roc_bert(model, config, tf_checkpoint_path): elif m_name == "kernel": array = np.transpose(array) try: - assert ( - pointer.shape == array.shape - ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") except AssertionError as e: e.args += (pointer.shape, array.shape) raise @@ -290,21 +289,21 @@ def __init__(self, config, position_embedding_type=None): self.is_decoder = config.is_decoder - def transpose_for_scores(self, x): + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) + x = x.view(new_x_shape) return x.permute(0, 2, 1, 3) def forward( self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys @@ -381,7 +380,7 @@ def forward( context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) + context_layer = context_layer.view(new_context_layer_shape) outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) @@ -398,7 +397,7 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states, input_tensor): + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -433,14 +432,14 @@ def prune_heads(self, heads): def forward( self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> 
Tuple[torch.Tensor]: self_outputs = self.self( hidden_states, attention_mask, @@ -465,7 +464,7 @@ def __init__(self, config): else: self.intermediate_act_fn = config.hidden_act - def forward(self, hidden_states): + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states @@ -479,7 +478,7 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states, input_tensor): + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -496,21 +495,22 @@ def __init__(self, config): self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: - assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = RocBertAttention(config, position_embedding_type="absolute") self.intermediate = RocBertIntermediate(config) self.output = RocBertOutput(config) def forward( self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( @@ -531,10 +531,11 @@ def forward( cross_attn_present_key_value = None if self.is_decoder and encoder_hidden_states is not None: - assert hasattr(self, "crossattention"), ( - f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by" - " setting `config.add_cross_attention=True`" - ) + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None @@ -581,17 +582,17 @@ def __init__(self, config): def forward( self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: 
Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -696,7 +697,7 @@ def __init__(self, config): self.transform_act_fn = config.hidden_act self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - def forward(self, hidden_states): + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) @@ -730,12 +731,12 @@ def __init__(self, config): super().__init__() self.predictions = RocBertLMPredictionHead(config) - def forward(self, sequence_output): + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: prediction_scores = self.predictions(sequence_output) return prediction_scores -# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->RocBert +# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->RocBert,bert->roc_bert class RocBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -896,8 +897,9 @@ def set_shape_embeddings(self, value): # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads def _prune_heads(self, heads_to_prune): - """Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @@ -1240,7 +1242,7 @@ def __init__(self, config): "bi-directional self-attention." 
) - self.roc_bert = RocBertModel(config) + self.roc_bert = RocBertModel(config, add_pooling_layer=False) self.cls = RocBertOnlyMLMHead(config) # Initialize weights and apply final processing @@ -1359,9 +1361,9 @@ def __init__(self, config): super().__init__(config) if not config.is_decoder: - logger.warning("If you want to use `RocBertForCausalLM` as a standalone, add `is_decoder=True.`") + logger.warning("If you want to use `RocRocBertForCausalLM` as a standalone, add `is_decoder=True.`") - self.roc_bert = RocBertModel(config) + self.roc_bert = RocBertModel(config, add_pooling_layer=False) self.cls = RocBertOnlyMLMHead(config) # Initialize weights and apply final processing @@ -1523,9 +1525,7 @@ def prepare_inputs_for_generation( def _reorder_cache(self, past, beam_idx): reordered_past = () for layer_past in past: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], - ) + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) return reordered_past @@ -1539,8 +1539,9 @@ class RocBertForSequenceClassification(RocBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.roc_bert = RocBertModel(config) + self.config = config + self.roc_bert = RocBertModel(config) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1752,7 +1753,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.roc_bert = RocBertModel(config) + self.roc_bert = RocBertModel(config, add_pooling_layer=False) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1839,7 +1840,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.roc_bert = RocBertModel(config) + self.roc_bert = RocBertModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) # Initialize weights and apply final processing diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index e509748ef6739..fd82c51d984f1 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -47,7 +47,6 @@ "word_pronunciation_file": "word_pronunciation.json", } -# todo: change the path PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "weiweishi/roc-bert-base-zh": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/vocab.txt" @@ -83,7 +82,7 @@ def load_vocab(vocab_file): return vocab -# Copied from transformers.models.bert.tokenization_bert.load_vocab +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize def whitespace_tokenize(text): """Runs basic whitespace cleaning and splitting on a piece of text.""" text = text.strip() @@ -198,12 +197,10 @@ def __init__( ) self.wordpiece_tokenizer = RocBertWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) - # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case @property def do_lower_case(self): return self.basic_tokenizer.do_lower_case - # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size @property def vocab_size(self): return len(self.vocab) @@ -777,13 +774,12 @@ def _convert_id_to_token(self, index): """Converts an index (integer) in a 
token (str) using the vocab.""" return self.ids_to_tokens.get(index, self.unk_token) - # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_tokens_to_string + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" out_string = " ".join(tokens).replace(" ##", "").strip() return out_string - # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens def build_inputs_with_special_tokens( self, token_ids_0: List[int], @@ -920,7 +916,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer with BasicTokenizer->RocBertBasicTokenizer class RocBertBasicTokenizer(object): """ - Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + Constructs a RocBertBasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). Args: do_lower_case (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 52f3d39b5d8ef..3ddc8b7fdcce3 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4481,6 +4481,83 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class RocBertForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RocBertForMaskedLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RocBertForMultipleChoice(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RocBertForPreTraining(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RocBertForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RocBertForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RocBertForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RocBertLayer(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RocBertModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RocBertPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +def load_tf_weights_in_roc_bert(*args, **kwargs): + requires_backends(load_tf_weights_in_roc_bert, ["torch"]) + + ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None From 160542818afac72c91fdcfdb63f30b35af910cac Mon Sep 17 00:00:00 2001 From: weiweishi Date: Wed, 2 Nov 2022 19:43:40 +0800 Subject: [PATCH 13/16] fix make repo-consistency error --- src/transformers/models/roc_bert/__init__.py | 
6 ++- .../models/roc_bert/configuration_roc_bert.py | 4 +- .../models/roc_bert/modeling_roc_bert.py | 5 +++ .../models/roc_bert/test_modeling_roc_bert.py | 39 +++++++++++++++---- 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/roc_bert/__init__.py b/src/transformers/models/roc_bert/__init__.py index c1afa2f07edd5..d30edab72fed0 100644 --- a/src/transformers/models/roc_bert/__init__.py +++ b/src/transformers/models/roc_bert/__init__.py @@ -31,6 +31,8 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass +else: + pass try: if not is_torch_available(): @@ -40,16 +42,16 @@ else: _import_structure["modeling_roc_bert"] = [ "ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", - "RocBertForMaskedLM", "RocBertForCausalLM", + "RocBertForMaskedLM", "RocBertForMultipleChoice", + "RocBertForPreTraining", "RocBertForQuestionAnswering", "RocBertForSequenceClassification", "RocBertForTokenClassification", "RocBertLayer", "RocBertModel", "RocBertPreTrainedModel", - "RocBertForPreTraining", "load_tf_weights_in_roc_bert", ] diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py index 6d407cbb56161..885686402c496 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -21,7 +21,7 @@ logger = logging.get_logger(__name__) ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "roc-bert-base-cased": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/config.json", + "weiweishi/roc-bert-base-zh": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/config.json", # See all RocBert models at https://huggingface.co/models?filter=roc_bert } @@ -31,7 +31,7 @@ class RocBertConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`~RocBertModel`]. It is used to instantiate an RocBert model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the RocBert - [roc-bert-base-cased](https://huggingface.co/weiweishi/roc-bert-base-zh) architecture. + [weiweishi/roc-bert-base-zh](https://huggingface.co/weiweishi/roc-bert-base-zh) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
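For orientation, a hedged sketch of what the corrected checkpoint identifier above is meant to enable (illustrative only, not part of the patch; the attribute names follow the configuration docstring in this file, and the class still carries its pre-rename `RocBertConfig` spelling at this point in the series):

```python
from transformers import RocBertConfig

# The archive map now points "weiweishi/roc-bert-base-zh" at the hosted config.json,
# so the configuration can be resolved directly from that identifier.
config = RocBertConfig.from_pretrained("weiweishi/roc-bert-base-zh")

# Besides the usual BERT-style fields, the config sizes the extra vocabularies
# used by the model's shape and pronunciation embeddings.
print(config.vocab_size, config.shape_vocab_size, config.pronunciation_vocab_size)
```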
diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 9e38e7f2352c4..adafd037c9db6 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -1090,7 +1090,10 @@ def forward( labels_input_pronunciation_ids: Optional[torch.Tensor] = None, labels_attention_mask: Optional[torch.Tensor] = None, labels_token_type_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **kwargs, ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: r""" attack_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1161,6 +1164,8 @@ def forward( position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, return_dict=return_dict, ) diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index dcee768a274c6..3de5740cdc9f9 100644 --- a/tests/models/roc_bert/test_modeling_roc_bert.py +++ b/tests/models/roc_bert/test_modeling_roc_bert.py @@ -17,6 +17,7 @@ import unittest from transformers import RocBertConfig, is_torch_available +from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device from ...test_configuration_common import ConfigTester @@ -27,6 +28,7 @@ import torch from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, RocBertForCausalLM, RocBertForMaskedLM, RocBertForMultipleChoice, @@ -563,12 +565,39 @@ class RocBertModelTest(ModelTesterMixin, unittest.TestCase): RocBertForQuestionAnswering, RocBertForSequenceClassification, RocBertForTokenClassification, + RocBertForPreTraining, ) if is_torch_available() else () ) all_generative_model_classes = (RocBertForCausalLM,) if is_torch_available() else () + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels_input_ids"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["labels_input_shape_ids"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["labels_input_pronunciation_ids"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["attack_input_ids"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["attack_input_shape_ids"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["attack_input_pronunciation_ids"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + return inputs_dict + def setUp(self): self.model_tester = RocBertModelTester(self) self.config_tester = ConfigTester(self, config_class=RocBertConfig, hidden_size=37) @@ -661,19 +690,15 @@ def test_model_from_pretrained(self): class 
RocBertModelIntegrationTest(unittest.TestCase): @slow def test_inference_masked_lm(self): - model = RocBertForMaskedLM.from_pretrained("roc-bert-base-cased") + model = RocBertForMaskedLM.from_pretrained("weiweishi/roc-bert-base-zh") input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) output = model(input_ids)[0] - # TODO Replace vocab size - vocab_size = 32000 + vocab_size = 21128 expected_shape = torch.Size((1, 6, vocab_size)) self.assertEqual(output.shape, expected_shape) - # TODO Replace values below with what was printed above. - expected_slice = torch.tensor( - [[[-0.0483, 0.1188, -0.0313], [-0.0606, 0.1435, 0.0199], [-0.0235, 0.1519, 0.0175]]] - ) + expected_slice = torch.tensor([[[0.6248, 0.3013, 0.3739], [0.3544, 0.8086, 0.2427], [0.3244, 0.6589, 0.1711]]]) self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) From f181e83d76996b13f0140b04a501eacd40a102fd Mon Sep 17 00:00:00 2001 From: weiweishi Date: Wed, 2 Nov 2022 19:50:29 +0800 Subject: [PATCH 14/16] fix files are not present in the table of contents error --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 59d355f0a5b5b..61ccc3cf89113 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -337,6 +337,8 @@ title: RetriBERT - local: model_doc/roberta title: RoBERTa + - local: model_doc/roc_bert + title: RocBert - local: model_doc/roformer title: RoFormer - local: model_doc/splinter From bd22af239b4264165e186bf36bed2e2851e8941f Mon Sep 17 00:00:00 2001 From: weiweishi Date: Thu, 3 Nov 2022 10:02:41 +0800 Subject: [PATCH 15/16] change RocBert -> RoCBert --- README.md | 2 +- README_es.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/_toctree.yml | 2 +- docs/source/en/index.mdx | 4 +- docs/source/en/model_doc/roc_bert.mdx | 44 ++--- src/transformers/__init__.py | 48 ++--- .../models/auto/configuration_auto.py | 4 +- src/transformers/models/auto/modeling_auto.py | 18 +- src/transformers/models/roc_bert/__init__.py | 48 ++--- .../models/roc_bert/configuration_roc_bert.py | 34 ++-- .../models/roc_bert/modeling_roc_bert.py | 182 +++++++++--------- .../models/roc_bert/tokenization_roc_bert.py | 22 +-- src/transformers/utils/dummy_pt_objects.py | 20 +- .../models/roc_bert/test_modeling_roc_bert.py | 74 +++---- .../roc_bert/test_tokenization_roc_bert.py | 66 +++---- 18 files changed, 281 insertions(+), 295 deletions(-) diff --git a/README.md b/README.md index 4537dce109bfc..1f8c8ab120a25 100644 --- a/README.md +++ b/README.md @@ -363,7 +363,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. 
**[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RocBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/README_es.md b/README_es.md index 95ff3bb4dd889..ff8ce41aad7c5 100644 --- a/README_es.md +++ b/README_es.md @@ -363,7 +363,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RocBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. 
**[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/README_ko.md b/README_ko.md index 8e6012c9edb13..df76470674a8f 100644 --- a/README_ko.md +++ b/README_ko.md @@ -313,7 +313,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RocBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. 
**[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/README_zh-hans.md b/README_zh-hans.md index ad50d7592a095..8be6ec78a3f57 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -337,7 +337,7 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。 -1. **[RocBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 +1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 17238c678195b..f1325d9b29a83 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -349,7 +349,7 @@ conda install -c huggingface transformers 1. 
**[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RocBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 61ccc3cf89113..2227ac4767d2b 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -338,7 +338,7 @@ - local: model_doc/roberta title: RoBERTa - local: model_doc/roc_bert - title: RocBert + title: RoCBert - local: model_doc/roformer title: RoFormer - local: model_doc/splinter diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 57a02dd52ed5a..92bcad0707df7 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -152,7 +152,7 @@ The documentation is organized into five sections: 1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. 
**[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RocBert](model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoCBert](model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. @@ -303,7 +303,7 @@ Flax), PyTorch, and/or TensorFlow. | ResNet | ❌ | ❌ | ✅ | ✅ | ❌ | | RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | | RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| RocBert | ✅ | ❌ | ✅ | ❌ | ❌ | +| RoCBert | ✅ | ❌ | ✅ | ❌ | ❌ | | RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | | SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | | SEW | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/roc_bert.mdx b/docs/source/en/model_doc/roc_bert.mdx index a6e69e9313ced..c30ccfd1c5239 100644 --- a/docs/source/en/model_doc/roc_bert.mdx +++ b/docs/source/en/model_doc/roc_bert.mdx @@ -10,11 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# RocBert +# RoCBert ## Overview -The RocBert model was proposed in [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +The RoCBert model was proposed in [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. It's a pretrained Chinese language model that is robust under various forms of adversarial attacks. The abstract from the paper is the following: @@ -31,63 +31,63 @@ in the toxic content detection task under human-made attacks.* This model was contributed by [weiweishi](https://huggingface.co/weiweishi). 
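For readers of this patch, a minimal usage sketch of the classes it adds (not part of the diff itself): it assumes the `weiweishi/roc-bert-base-zh` checkpoint exercised in the integration tests, and that the tokenizer returns `input_shape_ids` and `input_pronunciation_ids` alongside `input_ids` so they can be forwarded to the model.

```python
import torch
from transformers import RoCBertForMaskedLM, RoCBertTokenizer

tokenizer = RoCBertTokenizer.from_pretrained("weiweishi/roc-bert-base-zh")
model = RoCBertForMaskedLM.from_pretrained("weiweishi/roc-bert-base-zh")

# Assumption: the tokenizer emits shape and pronunciation ids in addition to input_ids,
# matching the extra embeddings RoCBert fuses with its token embeddings.
inputs = tokenizer("这是一个测试", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits  # (batch_size, sequence_length, vocab_size)
print(logits.shape)
```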
-## RocBertConfig +## RoCBertConfig -[[autodoc]] RocBertConfig +[[autodoc]] RoCBertConfig - all -## RocBertTokenizer +## RoCBertTokenizer -[[autodoc]] RocBertTokenizer +[[autodoc]] RoCBertTokenizer - build_inputs_with_special_tokens - get_special_tokens_mask - create_token_type_ids_from_sequences - save_vocabulary -## RocBertModel +## RoCBertModel -[[autodoc]] RocBertModel +[[autodoc]] RoCBertModel - forward -## RocBertForPreTraining +## RoCBertForPreTraining -[[autodoc]] RocBertForPreTraining +[[autodoc]] RoCBertForPreTraining - forward -## RocBertForCausalLM +## RoCBertForCausalLM -[[autodoc]] RocBertForCausalLM +[[autodoc]] RoCBertForCausalLM - forward -## RocBertForMaskedLM +## RoCBertForMaskedLM -[[autodoc]] RocBertForMaskedLM +[[autodoc]] RoCBertForMaskedLM - forward -## RocBertForSequenceClassification +## RoCBertForSequenceClassification -[[autodoc]] transformers.RocBertForSequenceClassification +[[autodoc]] transformers.RoCBertForSequenceClassification - forward -## RocBertForMultipleChoice +## RoCBertForMultipleChoice -[[autodoc]] transformers.RocBertForMultipleChoice +[[autodoc]] transformers.RoCBertForMultipleChoice - forward -## RocBertForTokenClassification +## RoCBertForTokenClassification -[[autodoc]] transformers.RocBertForTokenClassification +[[autodoc]] transformers.RoCBertForTokenClassification - forward -## RocBertForQuestionAnswering +## RoCBertForQuestionAnswering -[[autodoc]] RocBertForQuestionAnswering +[[autodoc]] RoCBertForQuestionAnswering - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 25b3572d5f304..6e73b0e68caa7 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -315,7 +315,7 @@ "models.resnet": ["RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ResNetConfig"], "models.retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig", "RetriBertTokenizer"], "models.roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaTokenizer"], - "models.roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RocBertConfig", "RocBertTokenizer"], + "models.roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoCBertConfig", "RoCBertTokenizer"], "models.roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerTokenizer"], "models.segformer": ["SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegformerConfig"], "models.sew": ["SEW_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWConfig"], @@ -845,16 +845,16 @@ _import_structure["models.roc_bert"].extend( [ "ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", - "RocBertForMaskedLM", - "RocBertForCausalLM", - "RocBertForMultipleChoice", - "RocBertForQuestionAnswering", - "RocBertForSequenceClassification", - "RocBertForTokenClassification", - "RocBertLayer", - "RocBertModel", - "RocBertForPreTraining", - "RocBertPreTrainedModel", + "RoCBertForMaskedLM", + "RoCBertForCausalLM", + "RoCBertForMultipleChoice", + "RoCBertForQuestionAnswering", + "RoCBertForSequenceClassification", + "RoCBertForTokenClassification", + "RoCBertLayer", + "RoCBertModel", + "RoCBertForPreTraining", + "RoCBertPreTrainedModel", "load_tf_weights_in_roc_bert", ] ) @@ -3375,7 +3375,7 @@ from .models.resnet import RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ResNetConfig from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer - from .models.roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, 
RocBertConfig, RocBertTokenizer + from .models.roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RoCBertConfig, RoCBertTokenizer from .models.roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerTokenizer from .models.segformer import SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SegformerConfig from .models.sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig @@ -3822,6 +3822,8 @@ ) from .generation_utils import top_k_top_p_filtering from .modeling_utils import PreTrainedModel + + # PyTorch model imports from .models.albert import ( ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertForMaskedLM, @@ -4637,20 +4639,18 @@ RobertaModel, RobertaPreTrainedModel, ) - - # PyTorch model imports from .models.roc_bert import ( ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, - RocBertForCausalLM, - RocBertForMaskedLM, - RocBertForMultipleChoice, - RocBertForPreTraining, - RocBertForQuestionAnswering, - RocBertForSequenceClassification, - RocBertForTokenClassification, - RocBertLayer, - RocBertModel, - RocBertPreTrainedModel, + RoCBertForCausalLM, + RoCBertForMaskedLM, + RoCBertForMultipleChoice, + RoCBertForPreTraining, + RoCBertForQuestionAnswering, + RoCBertForSequenceClassification, + RoCBertForTokenClassification, + RoCBertLayer, + RoCBertModel, + RoCBertPreTrainedModel, load_tf_weights_in_roc_bert, ) from .models.roformer import ( diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index f6b038b796880..d2680c13dce7d 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -122,7 +122,7 @@ ("resnet", "ResNetConfig"), ("retribert", "RetriBertConfig"), ("roberta", "RobertaConfig"), - ("roc_bert", "RocBertConfig"), + ("roc_bert", "RoCBertConfig"), ("roformer", "RoFormerConfig"), ("segformer", "SegformerConfig"), ("sew", "SEWConfig"), @@ -408,7 +408,7 @@ ("resnet", "ResNet"), ("retribert", "RetriBERT"), ("roberta", "RoBERTa"), - ("roc_bert", "RocBert"), + ("roc_bert", "RoCBert"), ("roformer", "RoFormer"), ("segformer", "SegFormer"), ("sew", "SEW"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 6765015b583a8..569aa82531863 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -120,7 +120,7 @@ ("resnet", "ResNetModel"), ("retribert", "RetriBertModel"), ("roberta", "RobertaModel"), - ("roc_bert", "RocBertModel"), + ("roc_bert", "RoCBertModel"), ("roformer", "RoFormerModel"), ("segformer", "SegformerModel"), ("sew", "SEWModel"), @@ -197,7 +197,7 @@ ("openai-gpt", "OpenAIGPTLMHeadModel"), ("retribert", "RetriBertModel"), ("roberta", "RobertaForMaskedLM"), - ("roc_bert", "RocBertForPreTraining"), + ("roc_bert", "RoCBertForPreTraining"), ("splinter", "SplinterForPreTraining"), ("squeezebert", "SqueezeBertForMaskedLM"), ("t5", "T5ForConditionalGeneration"), @@ -270,7 +270,7 @@ ("reformer", "ReformerModelWithLMHead"), ("rembert", "RemBertForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("roc_bert", "RocBertForMaskedLM"), + ("roc_bert", "RoCBertForMaskedLM"), ("roformer", "RoFormerForMaskedLM"), ("speech_to_text", "Speech2TextForConditionalGeneration"), ("squeezebert", "SqueezeBertForMaskedLM"), @@ -322,7 +322,7 @@ ("reformer", "ReformerModelWithLMHead"), ("rembert", "RemBertForCausalLM"), ("roberta", "RobertaForCausalLM"), - ("roc_bert", "RocBertForCausalLM"), + ("roc_bert", "RoCBertForCausalLM"), ("roformer", "RoFormerForCausalLM"), 
("speech_to_text_2", "Speech2Text2ForCausalLM"), ("transfo-xl", "TransfoXLLMHeadModel"), @@ -456,7 +456,7 @@ ("reformer", "ReformerForMaskedLM"), ("rembert", "RemBertForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("roc_bert", "RocBertForMaskedLM"), + ("roc_bert", "RoCBertForMaskedLM"), ("roformer", "RoFormerForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("tapas", "TapasForMaskedLM"), @@ -577,7 +577,7 @@ ("reformer", "ReformerForSequenceClassification"), ("rembert", "RemBertForSequenceClassification"), ("roberta", "RobertaForSequenceClassification"), - ("roc_bert", "RocBertForSequenceClassification"), + ("roc_bert", "RoCBertForSequenceClassification"), ("roformer", "RoFormerForSequenceClassification"), ("squeezebert", "SqueezeBertForSequenceClassification"), ("tapas", "TapasForSequenceClassification"), @@ -633,7 +633,7 @@ ("reformer", "ReformerForQuestionAnswering"), ("rembert", "RemBertForQuestionAnswering"), ("roberta", "RobertaForQuestionAnswering"), - ("roc_bert", "RocBertForQuestionAnswering"), + ("roc_bert", "RoCBertForQuestionAnswering"), ("roformer", "RoFormerForQuestionAnswering"), ("splinter", "SplinterForQuestionAnswering"), ("squeezebert", "SqueezeBertForQuestionAnswering"), @@ -703,7 +703,7 @@ ("qdqbert", "QDQBertForTokenClassification"), ("rembert", "RemBertForTokenClassification"), ("roberta", "RobertaForTokenClassification"), - ("roc_bert", "RocBertForTokenClassification"), + ("roc_bert", "RoCBertForTokenClassification"), ("roformer", "RoFormerForTokenClassification"), ("squeezebert", "SqueezeBertForTokenClassification"), ("xlm", "XLMForTokenClassification"), @@ -742,7 +742,7 @@ ("qdqbert", "QDQBertForMultipleChoice"), ("rembert", "RemBertForMultipleChoice"), ("roberta", "RobertaForMultipleChoice"), - ("roc_bert", "RocBertForMultipleChoice"), + ("roc_bert", "RoCBertForMultipleChoice"), ("roformer", "RoFormerForMultipleChoice"), ("squeezebert", "SqueezeBertForMultipleChoice"), ("xlm", "XLMForMultipleChoice"), diff --git a/src/transformers/models/roc_bert/__init__.py b/src/transformers/models/roc_bert/__init__.py index d30edab72fed0..a19398dfb8454 100644 --- a/src/transformers/models/roc_bert/__init__.py +++ b/src/transformers/models/roc_bert/__init__.py @@ -22,8 +22,8 @@ _import_structure = { - "configuration_roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RocBertConfig"], - "tokenization_roc_bert": ["RocBertTokenizer"], + "configuration_roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoCBertConfig"], + "tokenization_roc_bert": ["RoCBertTokenizer"], } try: @@ -42,22 +42,22 @@ else: _import_structure["modeling_roc_bert"] = [ "ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", - "RocBertForCausalLM", - "RocBertForMaskedLM", - "RocBertForMultipleChoice", - "RocBertForPreTraining", - "RocBertForQuestionAnswering", - "RocBertForSequenceClassification", - "RocBertForTokenClassification", - "RocBertLayer", - "RocBertModel", - "RocBertPreTrainedModel", + "RoCBertForCausalLM", + "RoCBertForMaskedLM", + "RoCBertForMultipleChoice", + "RoCBertForPreTraining", + "RoCBertForQuestionAnswering", + "RoCBertForSequenceClassification", + "RoCBertForTokenClassification", + "RoCBertLayer", + "RoCBertModel", + "RoCBertPreTrainedModel", "load_tf_weights_in_roc_bert", ] if TYPE_CHECKING: - from .configuration_roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RocBertConfig - from .tokenization_roc_bert import RocBertTokenizer + from .configuration_roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RoCBertConfig + from .tokenization_roc_bert import RoCBertTokenizer try: 
if not is_tokenizers_available(): @@ -75,16 +75,16 @@ else: from .modeling_roc_bert import ( ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, - RocBertForCausalLM, - RocBertForMaskedLM, - RocBertForMultipleChoice, - RocBertForPreTraining, - RocBertForQuestionAnswering, - RocBertForSequenceClassification, - RocBertForTokenClassification, - RocBertLayer, - RocBertModel, - RocBertPreTrainedModel, + RoCBertForCausalLM, + RoCBertForMaskedLM, + RoCBertForMultipleChoice, + RoCBertForPreTraining, + RoCBertForQuestionAnswering, + RoCBertForSequenceClassification, + RoCBertForTokenClassification, + RoCBertLayer, + RoCBertModel, + RoCBertPreTrainedModel, load_tf_weights_in_roc_bert, ) diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py index 885686402c496..5e7c45b294e70 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" RocBert model configuration""" +""" RoCBert model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -22,15 +22,15 @@ ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "weiweishi/roc-bert-base-zh": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/config.json", - # See all RocBert models at https://huggingface.co/models?filter=roc_bert + # See all RoCBert models at https://huggingface.co/models?filter=roc_bert } -class RocBertConfig(PretrainedConfig): +class RoCBertConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`~RocBertModel`]. It is used to instantiate an - RocBert model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the RocBert + This is the configuration class to store the configuration of a [`RoCBertModel`]. It is used to instantiate an + RoCBert model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the RoCBert [weiweishi/roc-bert-base-zh](https://huggingface.co/weiweishi/roc-bert-base-zh) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -39,8 +39,8 @@ class RocBertConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 30522): - Vocabulary size of the RocBert model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`~RocBertModel`] or [`~TFRocBertModel`]. + Vocabulary size of the RoCBert model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`RoCBertModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimension of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -60,7 +60,7 @@ class RocBertConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`~RocBertModel`] or [`~TFRocBertModel`]. 
+ The vocabulary size of the `token_type_ids` passed when calling [`RoCBertModel`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): @@ -77,23 +77,23 @@ class RocBertConfig(PretrainedConfig): pronunciation_embed_dim (`int`, *optional*, defaults to 768): Dimension of the pronunciation_embed. pronunciation_vocab_size (`int`, *optional*, defaults to 910): - Pronunciation Vocabulary size of the RocBert model. Defines the number of different tokens that can be - represented by the `input_pronunciation_ids` passed when calling [`~RocBertModel`]. + Pronunciation Vocabulary size of the RoCBert model. Defines the number of different tokens that can be + represented by the `input_pronunciation_ids` passed when calling [`RoCBertModel`]. shape_embed_dim (`int`, *optional*, defaults to 512): Dimension of the shape_embed. shape_vocab_size (`int`, *optional*, defaults to 24858): - Shape Vocabulary size of the RocBert model. Defines the number of different tokens that can be represented - by the `input_shape_ids` passed when calling [`~RocBertModel`]. + Shape Vocabulary size of the RoCBert model. Defines the number of different tokens that can be represented + by the `input_shape_ids` passed when calling [`RoCBertModel`]. Example: ```python - >>> from transformers import RocBertModel, RocBertConfig + >>> from transformers import RoCBertModel, RoCBertConfig - >>> # Initializing a RocBert weiweishi/roc-bert-base-zh style configuration - >>> configuration = RocBertConfig() + >>> # Initializing a RoCBert weiweishi/roc-bert-base-zh style configuration + >>> configuration = RoCBertConfig() >>> # Initializing a model from the weiweishi/roc-bert-base-zh style configuration - >>> model = RocBertModel(configuration) + >>> model = RoCBertModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index adafd037c9db6..ce1269193e1b0 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch RocBert model.""" +""" PyTorch RoCBert model.""" import math import os @@ -43,18 +43,18 @@ logging, replace_return_docstrings, ) -from .configuration_roc_bert import RocBertConfig +from .configuration_roc_bert import RoCBertConfig logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "weiweishi/roc-bert-base-zh" -_CONFIG_FOR_DOC = "RocBertConfig" -_TOKENIZER_FOR_DOC = "RocBertTokenizer" +_CONFIG_FOR_DOC = "RoCBertConfig" +_TOKENIZER_FOR_DOC = "RoCBertTokenizer" ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "weiweishi/roc-bert-base-zh", - # See all RocBert models at https://huggingface.co/models?filter=roc_bert + # See all RoCBert models at https://huggingface.co/models?filter=roc_bert ] @@ -132,7 +132,7 @@ def load_tf_weights_in_roc_bert(model, config, tf_checkpoint_path): return model -class RocBertEmbeddings(nn.Module): +class RoCBertEmbeddings(nn.Module): """Construct the embeddings from word, position, shape, pronunciation and token_type embeddings.""" def __init__(self, config): @@ -261,8 +261,8 @@ def forward( return embedding_in -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->RocBert -class RocBertSelfAttention(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->RoCBert +class RoCBertSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -362,7 +362,7 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in RocBertModel forward() function) + # Apply the attention mask is (precomputed for all layers in RoCBertModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. 
@@ -389,8 +389,8 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->RocBert -class RocBertSelfOutput(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->RoCBert +class RoCBertSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -404,12 +404,12 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->RocBert -class RocBertAttention(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->RoCBert +class RoCBertAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = RocBertSelfAttention(config, position_embedding_type=position_embedding_type) - self.output = RocBertSelfOutput(config) + self.self = RoCBertSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = RoCBertSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): @@ -454,8 +454,8 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->RocBert -class RocBertIntermediate(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->RoCBert +class RoCBertIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) @@ -470,8 +470,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->RocBert -class RocBertOutput(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->RoCBert +class RoCBertOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -485,21 +485,21 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->RocBert -class RocBertLayer(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->RoCBert +class RoCBertLayer(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = RocBertAttention(config) + self.attention = RoCBertAttention(config) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: if not self.is_decoder: raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = RocBertAttention(config, position_embedding_type="absolute") - self.intermediate = RocBertIntermediate(config) - self.output = RocBertOutput(config) + self.crossattention = RoCBertAttention(config, position_embedding_type="absolute") + self.intermediate = RoCBertIntermediate(config) + self.output = RoCBertOutput(config) def forward( self, @@ -572,12 +572,12 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->RocBert -class RocBertEncoder(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->RoCBert +class RoCBertEncoder(nn.Module): 
def __init__(self, config): super().__init__() self.config = config - self.layer = nn.ModuleList([RocBertLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([RoCBertLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -670,8 +670,8 @@ def custom_forward(*inputs): ) -# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->RocBert -class RocBertPooler(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->RoCBert +class RoCBertPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -686,8 +686,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output -# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->RocBert -class RocBertPredictionHeadTransform(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->RoCBert +class RoCBertPredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -704,11 +704,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->RocBert -class RocBertLMPredictionHead(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->RoCBert +class RoCBertLMPredictionHead(nn.Module): def __init__(self, config): super().__init__() - self.transform = RocBertPredictionHeadTransform(config) + self.transform = RoCBertPredictionHeadTransform(config) # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. @@ -725,25 +725,25 @@ def forward(self, hidden_states): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->RocBert -class RocBertOnlyMLMHead(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->RoCBert +class RoCBertOnlyMLMHead(nn.Module): def __init__(self, config): super().__init__() - self.predictions = RocBertLMPredictionHead(config) + self.predictions = RoCBertLMPredictionHead(config) def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: prediction_scores = self.predictions(sequence_output) return prediction_scores -# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->RocBert,bert->roc_bert -class RocBertPreTrainedModel(PreTrainedModel): +# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->RoCBert,bert->roc_bert +class RoCBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = RocBertConfig + config_class = RoCBertConfig load_tf_weights = load_tf_weights_in_roc_bert base_model_prefix = "roc_bert" supports_gradient_checkpointing = True @@ -766,7 +766,7 @@ def _init_weights(self, module): module.weight.data.fill_(1.0) def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, RocBertEncoder): + if isinstance(module, RoCBertEncoder): module.gradient_checkpointing = value @@ -776,7 +776,7 @@ def _set_gradient_checkpointing(self, module, value=False): behavior. 
Parameters: - config ([`~RocBertConfig`]): Model configuration class with all the parameters of the model. + config ([`RoCBertConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ @@ -786,21 +786,21 @@ def _set_gradient_checkpointing(self, module, value=False): input_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the vocabulary. - Indices can be obtained using [`RocBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and + Indices can be obtained using [`RoCBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) input_shape_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the shape vocabulary. - Indices can be obtained using [`RocBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and + Indices can be obtained using [`RoCBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input_shape_ids) input_pronunciation_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the pronunciation vocabulary. - Indices can be obtained using [`RocBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and + Indices can be obtained using [`RoCBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input_pronunciation_ids) @@ -846,10 +846,10 @@ def _set_gradient_checkpointing(self, module, value=False): @add_start_docstrings( - "The bare RocBert Model transformer outputting raw hidden-states without any specific head on top.", + "The bare RoCBert Model transformer outputting raw hidden-states without any specific head on top.", ROC_BERT_START_DOCSTRING, ) -class RocBertModel(RocBertPreTrainedModel): +class RoCBertModel(RoCBertPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -862,15 +862,15 @@ class RocBertModel(RocBertPreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->RocBert + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->RoCBert def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config - self.embeddings = RocBertEmbeddings(config) - self.encoder = RocBertEncoder(config) + self.embeddings = RoCBertEmbeddings(config) + self.encoder = RoCBertEncoder(config) - self.pooler = RocBertPooler(config) if add_pooling_layer else None + self.pooler = RoCBertPooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() @@ -1046,16 +1046,16 @@ def forward( @add_start_docstrings( """ - RocBert Model with contrastive loss and masked_lm_loss during the pretraining. + RoCBert Model with contrastive loss and masked_lm_loss during the pretraining. 
""", ROC_BERT_START_DOCSTRING, ) -class RocBertForPreTraining(RocBertPreTrainedModel): +class RoCBertForPreTraining(RoCBertPreTrainedModel): def __init__(self, config): super().__init__(config) - self.roc_bert = RocBertModel(config) - self.cls = RocBertOnlyMLMHead(config) + self.roc_bert = RoCBertModel(config) + self.cls = RoCBertOnlyMLMHead(config) # Initialize weights and apply final processing self.post_init() @@ -1130,11 +1130,11 @@ def forward( Example: ```python - >>> from transformers import RocBertTokenizer, RocBertForPreTraining + >>> from transformers import RoCBertTokenizer, RoCBertForPreTraining >>> import torch - >>> tokenizer = RocBertTokenizer.from_pretrained("weiweishi/roc-bert-base-zh") - >>> model = RocBertForPreTraining.from_pretrained("weiweishi/roc-bert-base-zh") + >>> tokenizer = RoCBertTokenizer.from_pretrained("weiweishi/roc-bert-base-zh") + >>> model = RoCBertForPreTraining.from_pretrained("weiweishi/roc-bert-base-zh") >>> inputs = tokenizer("你好,很高兴认识你", return_tensors="pt") >>> attack_inputs = tokenizer("你号,很高兴认识你", return_tensors="pt") @@ -1232,23 +1232,23 @@ def forward( ) -@add_start_docstrings("""RocBert Model with a `language modeling` head on top.""", ROC_BERT_START_DOCSTRING) -class RocBertForMaskedLM(RocBertPreTrainedModel): +@add_start_docstrings("""RoCBert Model with a `language modeling` head on top.""", ROC_BERT_START_DOCSTRING) +class RoCBertForMaskedLM(RoCBertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->RocBert,bert->roc_bert + # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->RoCBert,bert->roc_bert def __init__(self, config): super().__init__(config) if config.is_decoder: logger.warning( - "If you want to use `RocBertForMaskedLM` make sure `config.is_decoder=False` for " + "If you want to use `RoCBertForMaskedLM` make sure `config.is_decoder=False` for " "bi-directional self-attention." 
) - self.roc_bert = RocBertModel(config, add_pooling_layer=False) - self.cls = RocBertOnlyMLMHead(config) + self.roc_bert = RoCBertModel(config, add_pooling_layer=False) + self.cls = RoCBertOnlyMLMHead(config) # Initialize weights and apply final processing self.post_init() @@ -1355,21 +1355,21 @@ def prepare_inputs_for_generation( @add_start_docstrings( - """RocBert Model with a `language modeling` head on top for CLM fine-tuning.""", ROC_BERT_START_DOCSTRING + """RoCBert Model with a `language modeling` head on top for CLM fine-tuning.""", ROC_BERT_START_DOCSTRING ) -class RocBertForCausalLM(RocBertPreTrainedModel): +class RoCBertForCausalLM(RoCBertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->RocBertForCausalLM,Bert->RocBert,bert->roc_bert + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->RoCBertForCausalLM,Bert->RoCBert,bert->roc_bert def __init__(self, config): super().__init__(config) if not config.is_decoder: - logger.warning("If you want to use `RocRocBertForCausalLM` as a standalone, add `is_decoder=True.`") + logger.warning("If you want to use `RoCRoCBertForCausalLM` as a standalone, add `is_decoder=True.`") - self.roc_bert = RocBertModel(config, add_pooling_layer=False) - self.cls = RocBertOnlyMLMHead(config) + self.roc_bert = RoCBertModel(config, add_pooling_layer=False) + self.cls = RoCBertOnlyMLMHead(config) # Initialize weights and apply final processing self.post_init() @@ -1438,13 +1438,13 @@ def forward( Example: ```python - >>> from transformers import RocBertTokenizer, RocBertForCausalLM, RocBertConfig + >>> from transformers import RoCBertTokenizer, RoCBertForCausalLM, RoCBertConfig >>> import torch - >>> tokenizer = RocBertTokenizer.from_pretrained("weiweishi/roc-bert-base-zh") - >>> config = RocBertConfig.from_pretrained("weiweishi/roc-bert-base-zh") + >>> tokenizer = RoCBertTokenizer.from_pretrained("weiweishi/roc-bert-base-zh") + >>> config = RoCBertConfig.from_pretrained("weiweishi/roc-bert-base-zh") >>> config.is_decoder = True - >>> model = RocBertForCausalLM.from_pretrained("weiweishi/roc-bert-base-zh", config=config) + >>> model = RoCBertForCausalLM.from_pretrained("weiweishi/roc-bert-base-zh", config=config) >>> inputs = tokenizer("你好,很高兴认识你", return_tensors="pt") >>> outputs = model(**inputs) @@ -1535,18 +1535,18 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings( - """RocBert Model transformer with a sequence classification/regression head on top (a linear layer on top of + """RoCBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. 
for GLUE tasks.""", ROC_BERT_START_DOCSTRING, ) -class RocBertForSequenceClassification(RocBertPreTrainedModel): - # Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__ with Bert->RocBert,bert->roc_bert +class RoCBertForSequenceClassification(RoCBertPreTrainedModel): + # Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__ with Bert->RoCBert,bert->roc_bert def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.config = config - self.roc_bert = RocBertModel(config) + self.roc_bert = RoCBertModel(config) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1640,16 +1640,16 @@ def forward( @add_start_docstrings( - """RocBert Model with a multiple choice classification head on top (a linear layer on top of + """RoCBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""", ROC_BERT_START_DOCSTRING, ) -class RocBertForMultipleChoice(RocBertPreTrainedModel): - # Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ with Bert->RocBert,bert->roc_bert +class RoCBertForMultipleChoice(RoCBertPreTrainedModel): + # Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ with Bert->RoCBert,bert->roc_bert def __init__(self, config): super().__init__(config) - self.roc_bert = RocBertModel(config) + self.roc_bert = RoCBertModel(config) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1746,19 +1746,19 @@ def forward( @add_start_docstrings( - """RocBert Model with a token classification head on top (a linear layer on top of + """RoCBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.""", ROC_BERT_START_DOCSTRING, ) -class RocBertForTokenClassification(RocBertPreTrainedModel): +class RoCBertForTokenClassification(RoCBertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->RocBert,bert->roc_bert + # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->RoCBert,bert->roc_bert def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.roc_bert = RocBertModel(config, add_pooling_layer=False) + self.roc_bert = RoCBertModel(config, add_pooling_layer=False) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1833,19 +1833,19 @@ def forward( @add_start_docstrings( - """RocBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + """RoCBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""", ROC_BERT_START_DOCSTRING, ) -class RocBertForQuestionAnswering(RocBertPreTrainedModel): +class RoCBertForQuestionAnswering(RoCBertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->RocBert,bert->roc_bert + # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->RoCBert,bert->roc_bert def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.roc_bert = RocBertModel(config, add_pooling_layer=False) + self.roc_bert = RoCBertModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) # Initialize weights and apply final processing diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index fd82c51d984f1..a91421bad7577 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Tokenization classes for RocBert.""" +"""Tokenization classes for RoCBert.""" import collections import itertools @@ -92,7 +92,7 @@ def whitespace_tokenize(text): return tokens -class RocBertTokenizer(PreTrainedTokenizer): +class RoCBertTokenizer(PreTrainedTokenizer): r""" Args: Construct a RocBERT tokenizer. Based on WordPiece. This tokenizer inherits from [`PreTrainedTokenizer`] which @@ -174,7 +174,7 @@ def __init__( if cur_file is None or not os.path.isfile(cur_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google " - "pretrained model use `tokenizer = RocBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + "pretrained model use `tokenizer = RoCBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" ) self.vocab = load_vocab(vocab_file) @@ -189,13 +189,13 @@ def __init__( self.do_basic_tokenize = do_basic_tokenize if do_basic_tokenize: - self.basic_tokenizer = RocBertBasicTokenizer( + self.basic_tokenizer = RoCBertBasicTokenizer( do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = RocBertWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.wordpiece_tokenizer = RoCBertWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) @property def do_lower_case(self): @@ -886,7 +886,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = else: raise ValueError( f"Can't find a directory at path '{save_directory}'. To load the vocabulary from a Google " - "pretrained model use `tokenizer = RocBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + "pretrained model use `tokenizer = RoCBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" ) with open(vocab_file, "w", encoding="utf-8") as writer: @@ -913,10 +913,10 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = ) -# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer with BasicTokenizer->RocBertBasicTokenizer -class RocBertBasicTokenizer(object): +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer with BasicTokenizer->RoCBertBasicTokenizer +class RoCBertBasicTokenizer(object): """ - Constructs a RocBertBasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + Constructs a RoCBertBasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). 
Args: do_lower_case (`bool`, *optional*, defaults to `True`): @@ -1063,8 +1063,8 @@ def _clean_text(self, text): return "".join(output) -# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer with WordpieceTokenizer->RocBertWordpieceTokenizer -class RocBertWordpieceTokenizer(object): +# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer with WordpieceTokenizer->RoCBertWordpieceTokenizer +class RoCBertWordpieceTokenizer(object): """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 3ddc8b7fdcce3..f56f2ec835040 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4484,70 +4484,70 @@ def __init__(self, *args, **kwargs): ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None -class RocBertForCausalLM(metaclass=DummyObject): +class RoCBertForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RocBertForMaskedLM(metaclass=DummyObject): +class RoCBertForMaskedLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RocBertForMultipleChoice(metaclass=DummyObject): +class RoCBertForMultipleChoice(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RocBertForPreTraining(metaclass=DummyObject): +class RoCBertForPreTraining(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RocBertForQuestionAnswering(metaclass=DummyObject): +class RoCBertForQuestionAnswering(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RocBertForSequenceClassification(metaclass=DummyObject): +class RoCBertForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RocBertForTokenClassification(metaclass=DummyObject): +class RoCBertForTokenClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RocBertLayer(metaclass=DummyObject): +class RoCBertLayer(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RocBertModel(metaclass=DummyObject): +class RoCBertModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RocBertPreTrainedModel(metaclass=DummyObject): +class RoCBertPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index 3de5740cdc9f9..9f9ea43faf1d5 100644 --- a/tests/models/roc_bert/test_modeling_roc_bert.py +++ b/tests/models/roc_bert/test_modeling_roc_bert.py @@ -12,11 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch RocBert model. """ +""" Testing suite for the PyTorch RoCBert model. 
""" import unittest -from transformers import RocBertConfig, is_torch_available +from transformers import RoCBertConfig, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device @@ -29,19 +29,19 @@ from transformers import ( MODEL_FOR_PRETRAINING_MAPPING, - RocBertForCausalLM, - RocBertForMaskedLM, - RocBertForMultipleChoice, - RocBertForPreTraining, - RocBertForQuestionAnswering, - RocBertForSequenceClassification, - RocBertForTokenClassification, - RocBertModel, + RoCBertForCausalLM, + RoCBertForMaskedLM, + RoCBertForMultipleChoice, + RoCBertForPreTraining, + RoCBertForQuestionAnswering, + RoCBertForSequenceClassification, + RoCBertForTokenClassification, + RoCBertModel, ) from transformers.models.roc_bert.modeling_roc_bert import ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST -class RocBertModelTester: +class RoCBertModelTester: def __init__( self, parent, @@ -134,7 +134,7 @@ def prepare_config_and_inputs(self): ) def get_config(self): - return RocBertConfig( + return RoCBertConfig( vocab_size=self.vocab_size, shape_vocab_size=self.shape_vocab_size, pronunciation_vocab_size=self.pronunciation_vocab_size, @@ -196,7 +196,7 @@ def create_and_check_model( token_labels, choice_labels, ): - model = RocBertModel(config=config) + model = RoCBertModel(config=config) model.to(torch_device) model.eval() result = model( @@ -230,7 +230,7 @@ def create_and_check_model_as_decoder( encoder_attention_mask, ): config.add_cross_attention = True - model = RocBertModel(config) + model = RoCBertModel(config) model.to(torch_device) model.eval() result = model( @@ -273,7 +273,7 @@ def create_and_check_for_causal_lm( encoder_hidden_states, encoder_attention_mask, ): - model = RocBertForCausalLM(config=config) + model = RoCBertForCausalLM(config=config) model.to(torch_device) model.eval() result = model( @@ -298,7 +298,7 @@ def create_and_check_for_masked_lm( token_labels, choice_labels, ): - model = RocBertForMaskedLM(config=config) + model = RoCBertForMaskedLM(config=config) model.to(torch_device) model.eval() result = model( @@ -327,7 +327,7 @@ def create_and_check_decoder_model_past_large_inputs( ): config.is_decoder = True config.add_cross_attention = True - model = RocBertForCausalLM(config=config) + model = RoCBertForCausalLM(config=config) model.to(torch_device) model.eval() @@ -397,7 +397,7 @@ def create_and_check_for_question_answering( token_labels, choice_labels, ): - model = RocBertForQuestionAnswering(config=config) + model = RoCBertForQuestionAnswering(config=config) model.to(torch_device) model.eval() result = model( @@ -425,7 +425,7 @@ def create_and_check_for_sequence_classification( choice_labels, ): config.num_labels = self.num_labels - model = RocBertForSequenceClassification(config) + model = RoCBertForSequenceClassification(config) model.to(torch_device) model.eval() result = model( @@ -451,7 +451,7 @@ def create_and_check_for_token_classification( choice_labels, ): config.num_labels = self.num_labels - model = RocBertForTokenClassification(config=config) + model = RoCBertForTokenClassification(config=config) model.to(torch_device) model.eval() result = model( @@ -477,7 +477,7 @@ def create_and_check_for_multiple_choice( choice_labels, ): config.num_choices = self.num_choices - model = RocBertForMultipleChoice(config=config) + model = RoCBertForMultipleChoice(config=config) model.to(torch_device) model.eval() multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() @@ -531,7 
+531,7 @@ def create_and_check_for_pretraining( token_labels, choice_labels, ): - model = RocBertForPreTraining(config=config) + model = RoCBertForPreTraining(config=config) model.to(torch_device) model.eval() result = model( @@ -555,22 +555,22 @@ def create_and_check_for_pretraining( @require_torch -class RocBertModelTest(ModelTesterMixin, unittest.TestCase): +class RoCBertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( - RocBertModel, - RocBertForMaskedLM, - RocBertForCausalLM, - RocBertForMultipleChoice, - RocBertForQuestionAnswering, - RocBertForSequenceClassification, - RocBertForTokenClassification, - RocBertForPreTraining, + RoCBertModel, + RoCBertForMaskedLM, + RoCBertForCausalLM, + RoCBertForMultipleChoice, + RoCBertForQuestionAnswering, + RoCBertForSequenceClassification, + RoCBertForTokenClassification, + RoCBertForPreTraining, ) if is_torch_available() else () ) - all_generative_model_classes = (RocBertForCausalLM,) if is_torch_available() else () + all_generative_model_classes = (RoCBertForCausalLM,) if is_torch_available() else () # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): @@ -599,8 +599,8 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict def setUp(self): - self.model_tester = RocBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=RocBertConfig, hidden_size=37) + self.model_tester = RoCBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=RoCBertConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @@ -682,15 +682,15 @@ def test_model_as_decoder_with_default_input_mask(self): @slow def test_model_from_pretrained(self): for model_name in ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = RocBertModel.from_pretrained(model_name) + model = RoCBertModel.from_pretrained(model_name) self.assertIsNotNone(model) @require_torch -class RocBertModelIntegrationTest(unittest.TestCase): +class RoCBertModelIntegrationTest(unittest.TestCase): @slow def test_inference_masked_lm(self): - model = RocBertForMaskedLM.from_pretrained("weiweishi/roc-bert-base-zh") + model = RoCBertForMaskedLM.from_pretrained("weiweishi/roc-bert-base-zh") input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) output = model(input_ids)[0] diff --git a/tests/models/roc_bert/test_tokenization_roc_bert.py b/tests/models/roc_bert/test_tokenization_roc_bert.py index 14fdbfbecefc5..778403267427d 100644 --- a/tests/models/roc_bert/test_tokenization_roc_bert.py +++ b/tests/models/roc_bert/test_tokenization_roc_bert.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,9 +20,9 @@ from transformers.models.roc_bert.tokenization_roc_bert import ( VOCAB_FILES_NAMES, - RocBertBasicTokenizer, - RocBertTokenizer, - RocBertWordpieceTokenizer, + RoCBertBasicTokenizer, + RoCBertTokenizer, + RoCBertWordpieceTokenizer, _is_control, _is_punctuation, _is_whitespace, @@ -34,7 +34,7 @@ @require_tokenizers class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = RocBertTokenizer + tokenizer_class = RoCBertTokenizer rust_tokenizer_class = None test_rust_tokenizer = False space_between_special_tokens = True @@ -43,21 +43,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def setUp(self): super().setUp() - vocab_tokens = [ - "[UNK]", - "[CLS]", - "[SEP]", - "[PAD]", - "[MASK]", - "你", - "好", - "是", - "谁", - "a", - "b", - "c", - "d", - ] + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "你", "好", "是", "谁", "a", "b", "c", "d"] word_shape = dict() word_pronunciation = dict() for i, value in enumerate(vocab_tokens): @@ -82,88 +68,88 @@ def test_full_tokenizer(self): self.assertListEqual(tokenizer.convert_tokens_to_shape_ids(tokens), [5, 6, 2, 5, 7, 8]) self.assertListEqual(tokenizer.convert_tokens_to_pronunciation_ids(tokens), [5, 6, 2, 5, 7, 8]) - # Copied from tests.models.bert.test_tokenization_bert.test_chinese with BasicTokenizer->RocBertBertBasicTokenizer + # Copied from tests.models.bert.test_tokenization_bert.test_chinese with BasicTokenizer->RoCBertBertBasicTokenizer def test_chinese(self): - tokenizer = RocBertBasicTokenizer() + tokenizer = RoCBertBasicTokenizer() self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) - # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower with BasicTokenizer->RocBertBertBasicTokenizer + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower with BasicTokenizer->RoCBertBertBasicTokenizer def test_basic_tokenizer_lower(self): - tokenizer = RocBertBasicTokenizer(do_lower_case=True) + tokenizer = RoCBertBasicTokenizer(do_lower_case=True) self.assertListEqual( tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] ) self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) - # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower_strip_accents_false with BasicTokenizer->RocBertBertBasicTokenizer + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower_strip_accents_false with BasicTokenizer->RoCBertBertBasicTokenizer def test_basic_tokenizer_lower_strip_accents_false(self): - tokenizer = RocBertBasicTokenizer(do_lower_case=True, strip_accents=False) + tokenizer = RoCBertBasicTokenizer(do_lower_case=True, strip_accents=False) self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? 
"), ["hällo", "!", "how", "are", "you", "?"] ) self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) - # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower_strip_accents_true with BertBasicTokenizer->RocBertBertBasicTokenizer + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower_strip_accents_true with BertBasicTokenizer->RoCBertBertBasicTokenizer def test_basic_tokenizer_lower_strip_accents_true(self): - tokenizer = RocBertBasicTokenizer(do_lower_case=True, strip_accents=True) + tokenizer = RoCBertBasicTokenizer(do_lower_case=True, strip_accents=True) self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) - # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower_strip_accents_default with BasicTokenizer->RocBertBertBasicTokenizer + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_lower_strip_accents_default with BasicTokenizer->RoCBertBertBasicTokenizer def test_basic_tokenizer_lower_strip_accents_default(self): - tokenizer = RocBertBasicTokenizer(do_lower_case=True) + tokenizer = RoCBertBasicTokenizer(do_lower_case=True) self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) - # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_no_lower with BasicTokenizer->RocBertBertBasicTokenizer + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_no_lower with BasicTokenizer->RoCBertBertBasicTokenizer def test_basic_tokenizer_no_lower(self): - tokenizer = RocBertBasicTokenizer(do_lower_case=False) + tokenizer = RoCBertBasicTokenizer(do_lower_case=False) self.assertListEqual( tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] ) - # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_no_lower_strip_accents_false with BertBasicTokenizer->RocBertBertBasicTokenizer + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_no_lower_strip_accents_false with BertBasicTokenizer->RoCBertBertBasicTokenizer def test_basic_tokenizer_no_lower_strip_accents_false(self): - tokenizer = RocBertBasicTokenizer(do_lower_case=False, strip_accents=False) + tokenizer = RoCBertBasicTokenizer(do_lower_case=False, strip_accents=False) self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"] ) - # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_no_lower_strip_accents_true with BasicTokenizer->RocBertBertBasicTokenizer + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_no_lower_strip_accents_true with BasicTokenizer->RoCBertBertBasicTokenizer def test_basic_tokenizer_no_lower_strip_accents_true(self): - tokenizer = RocBertBasicTokenizer(do_lower_case=False, strip_accents=True) + tokenizer = RoCBertBasicTokenizer(do_lower_case=False, strip_accents=True) self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? 
"), ["HaLLo", "!", "how", "Are", "yoU", "?"] ) - # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_respects_never_split_tokens with BasicTokenizer->RocBertBertBasicTokenizer + # Copied from tests.models.bert.test_tokenization_bert.test_basic_tokenizer_respects_never_split_tokens with BasicTokenizer->RoCBertBertBasicTokenizer def test_basic_tokenizer_respects_never_split_tokens(self): - tokenizer = RocBertBasicTokenizer(do_lower_case=False, never_split=["[UNK]"]) + tokenizer = RoCBertBasicTokenizer(do_lower_case=False, never_split=["[UNK]"]) self.assertListEqual( tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] ) - # Copied from tests.models.bert.test_tokenization_bert.test_wordpiece_tokenizer with WordpieceTokenizer->RocBertWordpieceTokenizer + # Copied from tests.models.bert.test_tokenization_bert.test_wordpiece_tokenizer with WordpieceTokenizer->RoCBertWordpieceTokenizer def test_wordpiece_tokenizer(self): vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] vocab = {} for i, token in enumerate(vocab_tokens): vocab[token] = i - tokenizer = RocBertWordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + tokenizer = RoCBertWordpieceTokenizer(vocab=vocab, unk_token="[UNK]") self.assertListEqual(tokenizer.tokenize(""), []) From dec981e578df8ec799f47a4f924ac31ba78d63bc Mon Sep 17 00:00:00 2001 From: weiweishi Date: Tue, 8 Nov 2022 20:02:32 +0800 Subject: [PATCH 16/16] add doc, add detail test --- .../models/roc_bert/configuration_roc_bert.py | 15 +++++++++++++-- .../models/roc_bert/modeling_roc_bert.py | 10 ++++++---- .../models/roc_bert/tokenization_roc_bert.py | 2 +- .../models/roc_bert/test_modeling_roc_bert.py | 18 +++++++++++------- utils/documentation_tests.txt | 2 ++ 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py index 5e7c45b294e70..7ac989d005246 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -22,13 +22,12 @@ ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "weiweishi/roc-bert-base-zh": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/config.json", - # See all RoCBert models at https://huggingface.co/models?filter=roc_bert } class RoCBertConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`RoCBertModel`]. It is used to instantiate an + This is the configuration class to store the configuration of a [`RoCBertModel`]. It is used to instantiate a RoCBert model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the RoCBert [weiweishi/roc-bert-base-zh](https://huggingface.co/weiweishi/roc-bert-base-zh) architecture. @@ -68,6 +67,14 @@ class RoCBertConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. 
For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. enable_cls (`bool`, *optional*, defaults to `True`): Whether or not the model use cls loss when pretrained. enable_pronunciation (`bool`, *optional*, defaults to `True`): @@ -84,6 +91,10 @@ class RoCBertConfig(PretrainedConfig): shape_vocab_size (`int`, *optional*, defaults to 24858): Shape Vocabulary size of the RoCBert model. Defines the number of different tokens that can be represented by the `input_shape_ids` passed when calling [`RoCBertModel`]. + concat_input (`bool`, *optional*, defaults to `True`): + Defines the way of merging the shape_embed, pronunciation_embed and word_embed, if the value is true, + output_embed = torch.cat((word_embed, shape_embed, pronunciation_embed), -1), else output_embed = + (word_embed + shape_embed + pronunciation_embed) / 3 Example: ```python diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index ce1269193e1b0..684ff7db86918 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -858,7 +858,7 @@ class RoCBertModel(RoCBertPreTrainedModel): Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set - to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ @@ -1182,10 +1182,10 @@ def forward( device = labels_input_ids.device target_inputs = torch.clone(labels_input_ids) - target_inputs[target_inputs == -100] = 0 + target_inputs[target_inputs == -100] = self.config.pad_token_id labels_output = self.roc_bert( - labels_input_ids, + target_inputs, input_shape_ids=labels_input_shape_ids, input_pronunciation_ids=labels_input_pronunciation_ids, attention_mask=labels_attention_mask, @@ -1335,7 +1335,9 @@ def prepare_inputs_for_generation( effective_batch_size = input_shape[0] # add a dummy token - assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) dummy_token = torch.full( (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index a91421bad7577..07e740577a06c 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -95,7 +95,7 @@ def whitespace_tokenize(text): class RoCBertTokenizer(PreTrainedTokenizer): r""" Args: - Construct a RocBERT tokenizer. 
Based on WordPiece. This tokenizer inherits from [`PreTrainedTokenizer`] which + Construct a RoCBert tokenizer. Based on WordPiece. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. vocab_file (`str`): diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index 9f9ea43faf1d5..2a08b891c99c9 100644 --- a/tests/models/roc_bert/test_modeling_roc_bert.py +++ b/tests/models/roc_bert/test_modeling_roc_bert.py @@ -691,14 +691,18 @@ class RoCBertModelIntegrationTest(unittest.TestCase): @slow def test_inference_masked_lm(self): model = RoCBertForMaskedLM.from_pretrained("weiweishi/roc-bert-base-zh") - input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) - output = model(input_ids)[0] - vocab_size = 21128 + # input_text: ['[CLS]', 'b', 'a', '里', '系', '[MASK]', '国', '的', '首', '都', '[SEP]'] is the adversarial text + # of ['[CLS]', '巴', '黎', '是', '[MASK]', '国', '的', '首', '都', '[SEP]'], means + # "Paris is the [MASK] of France" in English + input_ids = torch.tensor([[101, 144, 143, 7027, 5143, 103, 1744, 4638, 7674, 6963, 102]]) + input_shape_ids = torch.tensor([[2, 20324, 23690, 8740, 706, 1, 10900, 23343, 20205, 5850, 2]]) + input_pronunciation_ids = torch.tensor([[2, 718, 397, 52, 61, 1, 168, 273, 180, 243, 2]]) - expected_shape = torch.Size((1, 6, vocab_size)) - self.assertEqual(output.shape, expected_shape) + output = model(input_ids, input_shape_ids, input_pronunciation_ids) + output_ids = torch.argmax(output.logits, dim=2) - expected_slice = torch.tensor([[[0.6248, 0.3013, 0.3739], [0.3544, 0.8086, 0.2427], [0.3244, 0.6589, 0.1711]]]) + # convert to tokens is: ['[CLS]', '巴', '*', '黎', '是', '法', '国', '的', '首', '都', '[SEP]'] + expected_output = torch.tensor([[101, 2349, 115, 7944, 3221, 3791, 1744, 4638, 7674, 6963, 102]]) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + self.assertTrue(output_ids, expected_output) diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 869d3a01045b0..4d6dde9239f26 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -122,6 +122,8 @@ src/transformers/models/resnet/modeling_tf_resnet.py src/transformers/models/roberta/configuration_roberta.py src/transformers/models/roberta/modeling_roberta.py src/transformers/models/roberta/modeling_tf_roberta.py +src/transformers/models/roc_bert/modeling_roc_bert.py +src/transformers/models/roc_bert/tokenization_roc_bert.py src/transformers/models/segformer/modeling_segformer.py src/transformers/models/sew/configuration_sew.py src/transformers/models/sew/modeling_sew.py
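
A minimal usage sketch of how the pieces added in this series fit together. The checkpoint name, the `model(**inputs)` calling pattern, and the adversarial sentence follow the docstring examples and the slow integration test in the patch above; it assumes, as the input docstrings indicate, that `RoCBertTokenizer.__call__` returns the shape and pronunciation ids alongside `input_ids`. Treat it as illustrative rather than as part of the patch itself.

```python
# Fill the [MASK] in the adversarial sentence used by the integration test
# ("Paris is the [MASK] of France", written with look-alike characters).
import torch

from transformers import RoCBertForMaskedLM, RoCBertTokenizer

tokenizer = RoCBertTokenizer.from_pretrained("weiweishi/roc-bert-base-zh")
model = RoCBertForMaskedLM.from_pretrained("weiweishi/roc-bert-base-zh")

# The tokenizer emits three aligned id streams -- input_ids, input_shape_ids and
# input_pronunciation_ids -- which RoCBertEmbeddings merges (concatenated or
# averaged, depending on `concat_input` in RoCBertConfig).
inputs = tokenizer("ba里系[MASK]国的首都", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# Pick the highest-scoring token at every position, as the integration test does.
predicted_ids = logits.argmax(dim=-1)
print(tokenizer.convert_ids_to_tokens(predicted_ids[0]))
```

If the checkpoint behaves as in the integration test, the masked position resolves to 法 ("France") even though neighbouring characters were swapped for visually or phonetically similar ones, which is the robustness the shape and pronunciation embeddings are meant to provide.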