From faa4d1d158a11ee58d1835f0b0ff75068a4ef686 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Thu, 3 Feb 2022 15:52:29 -0500 Subject: [PATCH 001/121] Add data2vec model cloned from roberta --- docs/source/index.mdx | 1 + docs/source/model_doc/data2vec.mdx | 69 + docs/source/serialization.mdx | 1 + src/transformers/__init__.py | 51 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/auto/modeling_auto.py | 9 + .../models/auto/tokenization_auto.py | 1 + src/transformers/models/data2vec/__init__.py | 60 + src/transformers/models/data2vec/cheat.py | 40 + .../models/data2vec/configuration_data2vec.py | 74 + ..._original_pytorch_checkpoint_to_pytorch.py | 178 ++ .../models/data2vec/data2vec_text.py | 519 ++++++ .../models/data2vec/modeling_data2vec.py | 1578 +++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 59 + tests/test_modeling_data2vec.py | 556 ++++++ 16 files changed, 3200 insertions(+) create mode 100644 docs/source/model_doc/data2vec.mdx create mode 100644 src/transformers/models/data2vec/__init__.py create mode 100644 src/transformers/models/data2vec/cheat.py create mode 100644 src/transformers/models/data2vec/configuration_data2vec.py create mode 100644 src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/data2vec/data2vec_text.py create mode 100644 src/transformers/models/data2vec/modeling_data2vec.py create mode 100644 tests/test_modeling_data2vec.py diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 6f97693065c8d..42b3df240bd08 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -177,6 +177,7 @@ Flax), PyTorch, and/or TensorFlow. | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| data2vec | ❌ | ❌ | ✅ | ✅ | ✅ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx new file mode 100644 index 0000000000000..a9fc0caff33f0 --- /dev/null +++ b/docs/source/model_doc/data2vec.mdx @@ -0,0 +1,69 @@ + + +# data2vec + +## Overview + +The data2vec model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
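A minimal usage sketch is shown below. The checkpoint id `"data2vec"` is the placeholder used in this PR's `DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST` and may not correspond to a published Hub checkpoint yet; data2vec text models reuse the RoBERTa tokenizer, as registered in `tokenization_auto.py`.

```python
>>> import torch
>>> from transformers import RobertaTokenizer, Data2VecForMaskedLM

>>> tokenizer = RobertaTokenizer.from_pretrained("data2vec")  # placeholder checkpoint id
>>> model = Data2VecForMaskedLM.from_pretrained("data2vec")

>>> inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> # pick the highest-scoring token at the masked position
>>> mask_index = (inputs.input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
>>> predicted_id = logits[0, mask_index].argmax(-1)
>>> tokenizer.decode(predicted_id)
```

Since [`Data2VecConfig`] directly subclasses [`BertConfig`], every BERT configuration argument can be passed unchanged when instantiating a model from scratch.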
+ + +## Data2VecConfig + +[[autodoc]] Data2VecConfig + +## Data2VecModel + +[[autodoc]] Data2VecModel + - forward + +## Data2VecForCausalLM + +[[autodoc]] Data2VecForCausalLM + - forward + +## Data2VecForMaskedLM + +[[autodoc]] Data2VecForMaskedLM + - forward + +## Data2VecForSequenceClassification + +[[autodoc]] Data2VecForSequenceClassification + - forward + +## Data2VecForMultipleChoice + +[[autodoc]] Data2VecForMultipleChoice + - forward + +## Data2VecForTokenClassification + +[[autodoc]] Data2VecForTokenClassification + - forward + +## Data2VecForQuestionAnswering + +[[autodoc]] Data2VecForQuestionAnswering + - forward diff --git a/docs/source/serialization.mdx b/docs/source/serialization.mdx index 83d291f7672e0..c4dc49e8c9610 100644 --- a/docs/source/serialization.mdx +++ b/docs/source/serialization.mdx @@ -49,6 +49,7 @@ Ready-made configurations include the following architectures: - BART - BERT - CamemBERT +- data2vec - DistilBERT - GPT Neo - I-BERT diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0a133f71ebd00..697d1dd7b95e9 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -203,6 +203,7 @@ "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.cpm": ["CpmTokenizer"], "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"], + "models.data2vec": ["DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecConfig"], "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"], "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], @@ -846,6 +847,19 @@ "CTRLPreTrainedModel", ] ) + _import_structure["models.data2vec"].extend( + [ + "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecForCausalLM", + "Data2VecForMaskedLM", + "Data2VecForMultipleChoice", + "Data2VecForQuestionAnswering", + "Data2VecForSequenceClassification", + "Data2VecForTokenClassification", + "Data2VecModel", + "Data2VecPreTrainedModel", + ] + ) _import_structure["models.deberta"].extend( [ "DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1695,6 +1709,20 @@ "TFCTRLPreTrainedModel", ] ) + _import_structure["models.data2vec"].extend( + [ + "TF_DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFData2VecForCausalLM", + "TFData2VecForMaskedLM", + "TFData2VecForMultipleChoice", + "TFData2VecForQuestionAnswering", + "TFData2VecForSequenceClassification", + "TFData2VecForTokenClassification", + "TFData2VecMainLayer", + "TFData2VecModel", + "TFData2VecPreTrainedModel", + ] + ) _import_structure["models.deberta"].extend( [ "TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2147,6 +2175,17 @@ "FlaxCLIPVisionPreTrainedModel", ] ) + _import_structure["models.data2vec"].extend( + [ + "FlaxData2VecForMaskedLM", + "FlaxData2VecForMultipleChoice", + "FlaxData2VecForQuestionAnswering", + "FlaxData2VecForSequenceClassification", + "FlaxData2VecForTokenClassification", + "FlaxData2VecModel", + "FlaxData2VecPreTrainedModel", + ] + ) _import_structure["models.distilbert"].extend( [ "FlaxDistilBertForMaskedLM", @@ -2393,6 +2432,7 @@ from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.cpm import CpmTokenizer from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer + from .models.data2vec import DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecConfig from .models.deberta import 
DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig @@ -2928,6 +2968,17 @@ CTRLModel, CTRLPreTrainedModel, ) + from .models.data2vec import ( + DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecForCausalLM, + Data2VecForMaskedLM, + Data2VecForMultipleChoice, + Data2VecForQuestionAnswering, + Data2VecForSequenceClassification, + Data2VecForTokenClassification, + Data2VecModel, + Data2VecPreTrainedModel, + ) from .models.deberta import ( DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, DebertaForMaskedLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index baf2b11ab7145..e7e78ec905795 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -39,6 +39,7 @@ convbert, cpm, ctrl, + data2vec, deberta, deberta_v2, deit, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ebd83c1be11cb..4acb20c3326f3 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -88,6 +88,7 @@ ("reformer", "ReformerConfig"), ("longformer", "LongformerConfig"), ("roberta", "RobertaConfig"), + ("data2vec", "Data2VecConfig"), ("deberta-v2", "DebertaV2Config"), ("deberta", "DebertaConfig"), ("flaubert", "FlaubertConfig"), @@ -171,6 +172,7 @@ ("xlnet", "XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("data2vec", "DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -261,6 +263,7 @@ ("reformer", "Reformer"), ("longformer", "Longformer"), ("roberta", "RoBERTa"), + ("data2vec", "data2vec"), ("flaubert", "FlauBERT"), ("fsmt", "FairSeq Machine-Translation"), ("squeezebert", "SqueezeBERT"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 94909cfde50b4..be5e3987a7b89 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -80,6 +80,7 @@ ("bart", "BartModel"), ("longformer", "LongformerModel"), ("roberta", "RobertaModel"), + ("data2vec", "Data2VecModel"), ("layoutlm", "LayoutLMModel"), ("squeezebert", "SqueezeBertModel"), ("bert", "BertModel"), @@ -130,6 +131,7 @@ ("fsmt", "FSMTForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), + ("data2vec", "Data2VecForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForPreTraining"), ("big_bird", "BigBirdForPreTraining"), @@ -187,6 +189,7 @@ ("bart", "BartForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), + ("data2vec", "Data2VecForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForMaskedLM"), ("openai-gpt", "OpenAIGPTLMHeadModel"), @@ -226,6 +229,7 @@ ("xlm-roberta-xl", "XLMRobertaXLForCausalLM"), ("xlm-roberta", "XLMRobertaForCausalLM"), ("roberta", "RobertaForCausalLM"), + ("data2vec", "Data2VecForCausalLM"), ("bert", "BertLMHeadModel"), ("openai-gpt", "OpenAIGPTLMHeadModel"), ("gpt2", "GPT2LMHeadModel"), @@ -312,6 +316,7 @@ ("xlm-roberta", "XLMRobertaForMaskedLM"), ("longformer", 
"LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), + ("data2vec", "Data2VecForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForMaskedLM"), ("megatron-bert", "MegatronBertForMaskedLM"), @@ -390,6 +395,7 @@ ("bart", "BartForSequenceClassification"), ("longformer", "LongformerForSequenceClassification"), ("roberta", "RobertaForSequenceClassification"), + ("data2vec", "Data2VecForSequenceClassification"), ("squeezebert", "SqueezeBertForSequenceClassification"), ("layoutlm", "LayoutLMForSequenceClassification"), ("bert", "BertForSequenceClassification"), @@ -439,6 +445,7 @@ ("xlm-roberta-xl", "XLMRobertaXLForQuestionAnswering"), ("xlm-roberta", "XLMRobertaForQuestionAnswering"), ("roberta", "RobertaForQuestionAnswering"), + ("data2vec", "Data2VecForQuestionAnswering"), ("squeezebert", "SqueezeBertForQuestionAnswering"), ("bert", "BertForQuestionAnswering"), ("xlnet", "XLNetForQuestionAnsweringSimple"), @@ -487,6 +494,7 @@ ("xlm-roberta", "XLMRobertaForTokenClassification"), ("longformer", "LongformerForTokenClassification"), ("roberta", "RobertaForTokenClassification"), + ("data2vec", "Data2VecForTokenClassification"), ("squeezebert", "SqueezeBertForTokenClassification"), ("bert", "BertForTokenClassification"), ("megatron-bert", "MegatronBertForTokenClassification"), @@ -521,6 +529,7 @@ ("xlm-roberta", "XLMRobertaForMultipleChoice"), ("longformer", "LongformerForMultipleChoice"), ("roberta", "RobertaForMultipleChoice"), + ("data2vec", "Data2VecForMultipleChoice"), ("squeezebert", "SqueezeBertForMultipleChoice"), ("bert", "BertForMultipleChoice"), ("distilbert", "DistilBertForMultipleChoice"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index d5dfff132d7c4..9a14ce8124723 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -107,6 +107,7 @@ ("bart", ("BartTokenizer", "BartTokenizerFast")), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ("data2vec", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ( "reformer", ( diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py new file mode 100644 index 0000000000000..63b7945d90f38 --- /dev/null +++ b/src/transformers/models/data2vec/__init__.py @@ -0,0 +1,60 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ...file_utils import _LazyModule, is_torch_available + + +_import_structure = { + "configuration_data2vec": ["DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecConfig", "Data2VecOnnxConfig"], +} + +if is_torch_available(): + _import_structure["modeling_data2vec"] = [ + "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecForCausalLM", + "Data2VecForMaskedLM", + "Data2VecForMultipleChoice", + "Data2VecForQuestionAnswering", + "Data2VecForSequenceClassification", + "Data2VecForTokenClassification", + "Data2VecModel", + "Data2VecPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_data2vec import DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecConfig, Data2VecOnnxConfig + + if is_torch_available(): + from .modeling_data2vec import ( + DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecForCausalLM, + Data2VecForMaskedLM, + Data2VecForMultipleChoice, + Data2VecForQuestionAnswering, + Data2VecForSequenceClassification, + Data2VecForTokenClassification, + Data2VecModel, + Data2VecPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/data2vec/cheat.py b/src/transformers/models/data2vec/cheat.py new file mode 100644 index 0000000000000..b043bef5a3547 --- /dev/null +++ b/src/transformers/models/data2vec/cheat.py @@ -0,0 +1,40 @@ +import os +import tarfile +import urllib.request + +# Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py +from .data2vec_text import Data2VecTextModel + + +class Data2VecFairseqProxy(): + def __init__(self, module): + self.module = module + + @classmethod + def from_pretrained(cls, mname): + ckpt = f"{mname}.pt" + cls._download_weights(model=ckpt) + return cls(Data2VecTextModel.from_pretrained("roberta/roberta.large", checkpoint_file=ckpt)) + + @staticmethod + def _download_weights(model: str="nlp_base.pt"): + assert model in ("nlp_base.pt", "audio_base_ls.pt"), "Weights not found" + root_url = "https://dl.fbaipublicfiles.com/fairseq" + + if model == "nlp_base.pt": + # Need download RoBERTa first to get the dictionary file + if not os.path.isdir("roberta"): + print("Downloading roberta") + urllib.request.urlretrieve(f"{root_url}/models/roberta.large.tar.gz", "roberta.large.tar.gz") + with tarfile.open("roberta.large.tar.gz") as f: + f.extractall("roberta") + # Remove Roberta model weights and tar file + os.remove(os.path.join("roberta", "roberta.large", "model.pt")) + os.remove(os.path.join("roberta.large.tar.gz")) + + # Then download the actual data2vec weights + model_url = f"{root_url}/data2vec/{model}" + model_path = os.path.join("roberta", "roberta.large", model) + if not os.path.isfile(model_path): + print("Downloading model...") + urllib.request.urlretrieve(model_url, model_path) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py new file mode 100644 index 0000000000000..43a80373ae748 --- /dev/null +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -0,0 +1,74 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" data2vec configuration""" +from collections import OrderedDict +from typing import Mapping + +from ...onnx import OnnxConfig +from ...utils import logging +from ..bert.configuration_bert import BertConfig + + +logger = logging.get_logger(__name__) + +DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "data2vec": "https://huggingface.co/data2vec/resolve/main/config.json", +} + + + +class Data2VecConfig(BertConfig): + r""" + This is the configuration class to store the configuration of a [`Data2VecModel`] or a [`TFData2VecModel`]. It is + used to instantiate a data2vec model according to the specified arguments, defining the model architecture. + + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + The [`Data2VecConfig`] class directly inherits [`BertConfig`]. It reuses the same defaults. Please check the parent + class for more information. + + Examples: + + ```python + >>> from transformers import Data2VecConfig, Data2VecModel + + >>> # Initializing a data2vec configuration + >>> configuration = Data2VecConfig() + + >>> # Initializing a model from the configuration + >>> model = Data2VecModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "data2vec" + + def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): + """Constructs Data2VecConfig.""" + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + +class Data2VecOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000..d8c13ff0a1b40 --- /dev/null +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,178 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
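# Example invocation (a sketch; the paths below are illustrative and not shipped with this PR —
# see `cheat.py` for one way to fetch the fairseq `nlp_base.pt` checkpoint into
# `roberta/roberta.large`):
#
#   python convert_data2vec_original_pytorch_checkpoint_to_pytorch.py \
#       --data2vec_checkpoint_path roberta/roberta.large \
#       --pytorch_dump_folder_path ./converted-data2vec \
#       --classification_head  # only for checkpoints carrying an "mnli" classification head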
+"""Convert data2vec checkpoint.""" + + +import argparse +import pathlib + +import fairseq +import torch +from fairseq.models.data2vec import Data2VecModel as FairseqData2VecModel +from fairseq.modules import TransformerSentenceEncoderLayer +from packaging import version + +from transformers import Data2VecConfig, Data2VecForMaskedLM, Data2VecForSequenceClassification +from transformers.models.bert.modeling_bert import ( + BertIntermediate, + BertLayer, + BertOutput, + BertSelfAttention, + BertSelfOutput, +) +from transformers.utils import logging + + +if version.parse(fairseq.__version__) < version.parse("0.9.0"): + raise Exception("requires fairseq >= 0.9.0") + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +SAMPLE_TEXT = "Hello world! cécé herlolip" + + +def convert_data2vec_checkpoint_to_pytorch( + data2vec_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool +): + """ + Copy/paste/tweak data2vec's weights to our BERT structure. + """ + data2vec = FairseqData2VecModel.from_pretrained(data2vec_checkpoint_path) + data2vec.eval() # disable dropout + data2vec_sent_encoder = data2vec.model.encoder.sentence_encoder + config = Data2VecConfig( + vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings, + hidden_size=data2vec.args.encoder_embed_dim, + num_hidden_layers=data2vec.args.encoder_layers, + num_attention_heads=data2vec.args.encoder_attention_heads, + intermediate_size=data2vec.args.encoder_ffn_embed_dim, + max_position_embeddings=514, + type_vocab_size=1, + layer_norm_eps=1e-5, # PyTorch default used in fairseq + ) + if classification_head: + config.num_labels = data2vec.model.classification_heads["mnli"].out_proj.weight.shape[0] + print("Our BERT config:", config) + + model = Data2VecForSequenceClassification(config) if classification_head else Data2VecForMaskedLM(config) + model.eval() + + # Now let's copy all the weights. + # Embeddings + model.data2vec.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight + model.data2vec.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight + model.data2vec.embeddings.token_type_embeddings.weight.data = torch.zeros_like( + model.data2vec.embeddings.token_type_embeddings.weight + ) # just zero them out b/c data2vec doesn't use them. 
+ model.data2vec.embeddings.LayerNorm.weight = data2vec_sent_encoder.emb_layer_norm.weight + model.data2vec.embeddings.LayerNorm.bias = data2vec_sent_encoder.emb_layer_norm.bias + + for i in range(config.num_hidden_layers): + # Encoder: start of layer + layer: BertLayer = model.data2vec.encoder.layer[i] + data2vec_layer: TransformerSentenceEncoderLayer = data2vec_sent_encoder.layers[i] + + # self attention + self_attn: BertSelfAttention = layer.attention.self + assert ( + data2vec_layer.self_attn.k_proj.weight.data.shape + == data2vec_layer.self_attn.q_proj.weight.data.shape + == data2vec_layer.self_attn.v_proj.weight.data.shape + == torch.Size((config.hidden_size, config.hidden_size)) + ) + + self_attn.query.weight.data = data2vec_layer.self_attn.q_proj.weight + self_attn.query.bias.data = data2vec_layer.self_attn.q_proj.bias + self_attn.key.weight.data = data2vec_layer.self_attn.k_proj.weight + self_attn.key.bias.data = data2vec_layer.self_attn.k_proj.bias + self_attn.value.weight.data = data2vec_layer.self_attn.v_proj.weight + self_attn.value.bias.data = data2vec_layer.self_attn.v_proj.bias + + # self-attention output + self_output: BertSelfOutput = layer.attention.output + assert self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape + self_output.dense.weight = data2vec_layer.self_attn.out_proj.weight + self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias + self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight + self_output.LayerNorm.bias = data2vec_layer.self_attn_layer_norm.bias + + # intermediate + intermediate: BertIntermediate = layer.intermediate + assert intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape + intermediate.dense.weight = data2vec_layer.fc1.weight + intermediate.dense.bias = data2vec_layer.fc1.bias + + # output + bert_output: BertOutput = layer.output + assert bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape + bert_output.dense.weight = data2vec_layer.fc2.weight + bert_output.dense.bias = data2vec_layer.fc2.bias + bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight + bert_output.LayerNorm.bias = data2vec_layer.final_layer_norm.bias + # end of layer + + if classification_head: + model.classifier.dense.weight = data2vec.model.classification_heads["mnli"].dense.weight + model.classifier.dense.bias = data2vec.model.classification_heads["mnli"].dense.bias + model.classifier.out_proj.weight = data2vec.model.classification_heads["mnli"].out_proj.weight + model.classifier.out_proj.bias = data2vec.model.classification_heads["mnli"].out_proj.bias + else: + # LM Head + model.lm_head.dense.weight = data2vec.model.encoder.lm_head.dense.weight + model.lm_head.dense.bias = data2vec.model.encoder.lm_head.dense.bias + model.lm_head.layer_norm.weight = data2vec.model.encoder.lm_head.layer_norm.weight + model.lm_head.layer_norm.bias = data2vec.model.encoder.lm_head.layer_norm.bias + model.lm_head.decoder.weight = data2vec.model.encoder.lm_head.weight + model.lm_head.decoder.bias = data2vec.model.encoder.lm_head.bias + + # Let's check that we get the same results. 
+ input_ids: torch.Tensor = data2vec.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 + + our_output = model(input_ids)[0] + if classification_head: + their_output = data2vec.model.classification_heads["mnli"](data2vec.extract_features(input_ids)) + else: + their_output = data2vec.model(input_ids)[0] + print(our_output.shape, their_output.shape) + max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() + print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 + success = torch.allclose(our_output, their_output, atol=1e-3) + print("Do both models output the same tensors?", "🔥" if success else "💩") + if not success: + raise Exception("Something went wRoNg") + + pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--data2vec_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--classification_head", action="store_true", help="Whether to convert a final classification head." + ) + args = parser.parse_args() + convert_data2vec_checkpoint_to_pytorch( + args.data2vec_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head + ) diff --git a/src/transformers/models/data2vec/data2vec_text.py b/src/transformers/models/data2vec/data2vec_text.py new file mode 100644 index 0000000000000..5cc1d37e842aa --- /dev/null +++ b/src/transformers/models/data2vec/data2vec_text.py @@ -0,0 +1,519 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass, field +from typing import Optional +import logging +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from omegaconf import II + +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.configs import EMAConfig +from fairseq.models.ema import EMA +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderModel, + register_model, +) +from fairseq.models.roberta.model import RobertaLMHead, RobertaClassificationHead +from fairseq.models.transformer import TransformerEncoder, TransformerConfig +from fairseq.modules.transformer_sentence_encoder import init_bert_params + +logger = logging.getLogger(__name__) + + +@dataclass +class Data2VecTextConfig(FairseqDataclass): + max_positions: int = II("task.tokens_per_sample") + + head_layers: int = 1 + + transformer: TransformerConfig = TransformerConfig() + + load_checkpoint_heads: bool = field( + default=False, + metadata={"help": "(re-)register and load heads when loading checkpoints"}, + ) + + loss_beta: float = field( + default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"} + ) + loss_scale: Optional[float] = field( + default=None, + metadata={ + "help": "scale the reconstruction loss by this constant. 
if None then scales by 1/sqrt(dim)" + }, + ) + average_top_k_layers: int = field( + default=8, metadata={"help": "how many layers to average"} + ) + + layer_norm_target_layer: bool = False + instance_norm_target_layer: bool = False + batch_norm_target_layer: bool = False + instance_norm_targets: bool = False + layer_norm_targets: bool = False + + ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) + ema_end_decay: float = field( + default=0.9999, metadata={"help": "final ema decay rate"} + ) + + # when to finish annealing ema decay rate + ema_anneal_end_step: int = II("optimization.max_update") + + ema_transformer_layers_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer layers"}, + ) + + +def get_annealed_rate(start, end, curr_step, total_steps): + r = end - start + pct_remaining = 1 - curr_step / total_steps + return end - r * pct_remaining + + +@register_model("data2vec_text", dataclass=Data2VecTextConfig) +class Data2VecTextModel(FairseqEncoderModel): + def __init__(self, cfg: Data2VecTextConfig, encoder): + super().__init__(encoder) + self.cfg = cfg + + # We follow BERT's random weight initialization + self.apply(init_bert_params) + + self.classification_heads = nn.ModuleDict() + + @classmethod + def build_model(cls, cfg, task): + """Build a new model instance.""" + + encoder = Data2VecTextEncoder(cfg, task.source_dictionary, task.cfg.data) + + return cls(cfg, encoder) + + def forward( + self, + src_tokens, + target_tokens=None, + features_only=False, + return_all_hiddens=False, + classification_head_name=None, + **kwargs, + ): + if classification_head_name is not None: + features_only = True + + res = self.encoder( + src_tokens, target_tokens, features_only, return_all_hiddens, **kwargs + ) + + if isinstance(res, tuple): + x, extra = res + else: + return res + + if classification_head_name is not None: + x = self.classification_heads[classification_head_name](x) + return x, extra + + def get_normalized_probs(self, net_output, log_probs, sample=None): + """Get normalized probabilities (or log probs) from a net's output.""" + logits = net_output[0].float() + if log_probs: + return F.log_softmax(logits, dim=-1) + else: + return F.softmax(logits, dim=-1) + + def register_classification_head( + self, name, num_classes=None, inner_dim=None, **kwargs + ): + """Register a classification head.""" + if name in self.classification_heads: + prev_num_classes = self.classification_heads[name].out_proj.out_features + prev_inner_dim = self.classification_heads[name].dense.out_features + if num_classes != prev_num_classes or inner_dim != prev_inner_dim: + logger.warning( + 're-registering head "{}" with num_classes {} (prev: {}) ' + "and inner_dim {} (prev: {})".format( + name, num_classes, prev_num_classes, inner_dim, prev_inner_dim + ) + ) + self.classification_heads[name] = RobertaClassificationHead( + input_dim=self.cfg.transformer.encoder.embed_dim, + inner_dim=inner_dim or self.cfg.transformer.encoder.embed_dim, + num_classes=num_classes, + activation_fn="tanh", + pooler_dropout=0, + ) + + @property + def supported_targets(self): + return {"self"} + + def upgrade_state_dict_named(self, state_dict, name): + prefix = name + "." 
if name != "" else "" + + # rename decoder -> encoder before upgrading children modules + for k in list(state_dict.keys()): + if k.startswith(prefix + "decoder"): + new_k = prefix + "encoder" + k[len(prefix + "decoder") :] + state_dict[new_k] = state_dict[k] + del state_dict[k] + + # rename emb_layer_norm -> layernorm_embedding + for k in list(state_dict.keys()): + if ".emb_layer_norm." in k: + new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.") + state_dict[new_k] = state_dict[k] + del state_dict[k] + + if self.encoder.regression_head is not None: + if ".lm_head." in k: + new_k = k.replace(".lm_head.", ".regression_head.") + state_dict[new_k] = state_dict[k] + del state_dict[k] + else: + if ".regression_head." in k: + del state_dict[k] + + # upgrade children modules + super().upgrade_state_dict_named(state_dict, name) + + # Handle new classification heads present in the state dict. + current_head_names = ( + [] + if not hasattr(self, "classification_heads") + or self.classification_heads is None + else self.classification_heads.keys() + ) + keys_to_delete = [] + for k in state_dict.keys(): + if not k.startswith(prefix + "classification_heads."): + continue + + head_name = k[len(prefix + "classification_heads.") :].split(".")[0] + num_classes = state_dict[ + prefix + "classification_heads." + head_name + ".out_proj.weight" + ].size(0) + inner_dim = state_dict[ + prefix + "classification_heads." + head_name + ".dense.weight" + ].size(0) + + if self.cfg.load_checkpoint_heads: + if head_name not in current_head_names: + self.register_classification_head(head_name, num_classes, inner_dim) + else: + if head_name not in current_head_names: + logger.warning( + "deleting classification head ({}) from checkpoint " + "not present in current model: {}".format(head_name, k) + ) + keys_to_delete.append(k) + elif ( + num_classes + != self.classification_heads[head_name].out_proj.out_features + or inner_dim + != self.classification_heads[head_name].dense.out_features + ): + logger.warning( + "deleting classification head ({}) from checkpoint " + "with different dimensions than current model: {}".format( + head_name, k + ) + ) + keys_to_delete.append(k) + for k in keys_to_delete: + del state_dict[k] + + # Copy any newly-added classification heads into the state dict + # with their current weights. + if ( + hasattr(self, "classification_heads") + and self.classification_heads is not None + and len(self.classification_heads) > 0 + ): + cur_state = self.classification_heads.state_dict() + for k, v in cur_state.items(): + if prefix + "classification_heads." + k not in state_dict: + logger.info("Overwriting " + prefix + "classification_heads." + k) + state_dict[prefix + "classification_heads." + k] = v + + for k in list(state_dict.keys()): + if k.startswith(prefix + "encoder.lm_head.") or k.startswith( + prefix + "encoder.emb_head." 
+ ): + del state_dict[k] + + self.encoder.lm_head = None + + if self.encoder.target_model is None: + for k in list(state_dict.keys()): + if k.startswith(prefix + "encoder.target_model."): + del state_dict[k] + + if (self.encoder.ema is None) and (prefix + "encoder._ema" in state_dict): + del state_dict[prefix + "encoder._ema"] + + def remove_pretraining_modules(self, last_layer=None): + self.encoder.lm_head = None + self.encoder.regression_head = None + self.encoder.ema = None + self.classification_heads = None + + if last_layer is not None: + self.encoder.sentence_encoder.layers = nn.ModuleList( + l + for i, l in enumerate(self.encoder.sentence_encoder.layers) + if i <= last_layer + ) + self.encoder.sentence_encoder.layer_norm = None + + +class Data2VecTextEncoder(FairseqEncoder): + def __init__(self, cfg: Data2VecTextConfig, dictionary, task_data): + super().__init__(dictionary) + + # HACK + self.target_model = None + + self.cfg = cfg + + embed_tokens = self.build_embedding( + len(dictionary), cfg.transformer.encoder.embed_dim, dictionary.pad() + ) + + self.sentence_encoder = self.build_encoder(cfg, dictionary, embed_tokens) + self.mask_idx = dictionary.index("") + assert self.mask_idx != dictionary.unk(), dictionary.symbols + + self.ema = None + self.average_top_k_layers = cfg.average_top_k_layers + self.loss_scale = cfg.loss_scale + + assert self.cfg.head_layers >= 1 + + embed_dim = cfg.transformer.encoder.embed_dim + curr_dim = embed_dim + projs = [] + for i in range(self.cfg.head_layers - 1): + next_dim = embed_dim * 2 if i == 0 else curr_dim + projs.append(nn.Linear(curr_dim, next_dim)) + projs.append(nn.GELU()) + curr_dim = next_dim + + projs.append(nn.Linear(curr_dim, embed_dim)) + self.regression_head = nn.Sequential(*projs) + + self.num_updates = 0 + + def build_embedding(self, vocab_size, embedding_dim, padding_idx): + return nn.Embedding(vocab_size, embedding_dim, padding_idx) + + def build_encoder(self, cfg, dictionary, embed_tokens): + encoder = TransformerEncoder(cfg.transformer, dictionary, embed_tokens, return_fc=True) + encoder.apply(init_bert_params) + return encoder + + def build_lm_head(self, embed_dim, output_dim, activation_fn, weight): + return RobertaLMHead(embed_dim, output_dim, activation_fn, weight) + + def make_ema_teacher(self): + ema_config = EMAConfig( + ema_decay=self.cfg.ema_decay, + ema_fp32=True, + ) + skip_keys = set() + if self.cfg.ema_transformer_layers_only: + for k, _ in self.sentence_encoder.embed_positions.named_parameters(): + skip_keys.add(f"embed_tokens.{k}") + for k, _ in self.sentence_encoder.embed_positions.named_parameters(): + skip_keys.add(f"embed_positions.{k}") + if self.sentence_encoder.layernorm_embedding is not None: + for ( + k, + _, + ) in self.sentence_encoder.layernorm_embedding.named_parameters(): + skip_keys.add(f"layernorm_embedding.{k}") + if self.sentence_encoder.layer_norm is not None: + for k, _ in self.sentence_encoder.layer_norm.named_parameters(): + skip_keys.add(f"layernorm_embedding.{k}") + + self.ema = EMA( + self.sentence_encoder, + ema_config, + skip_keys=skip_keys, + ) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + + if self.ema is None and self.regression_head is not None: + logger.info(f"making ema teacher") + self.make_ema_teacher() + elif self.training and self.ema is not None: + if self.cfg.ema_decay != self.cfg.ema_end_decay: + if num_updates >= self.cfg.ema_anneal_end_step: + decay = self.cfg.ema_end_decay + else: + decay = get_annealed_rate( + self.cfg.ema_decay, + 
self.cfg.ema_end_decay, + num_updates, + self.cfg.ema_anneal_end_step, + ) + self.ema._set_decay(decay) + if self.ema.get_decay() < 1: + self.ema.step(self.sentence_encoder) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + state = super().state_dict(destination, prefix, keep_vars) + if self.ema is not None: + state[prefix + "_ema"] = self.ema.fp32_params + return state + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + if self.ema is not None: + k = prefix + "_ema" + assert k in state_dict + self.ema.restore(state_dict[k], True) + del state_dict[k] + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + def forward( + self, + src_tokens, + target_tokens=None, + features_only=False, + return_all_hiddens=False, + masked_tokens=None, + **unused, + ): + """ + Args: + src_tokens (LongTensor): input tokens of shape `(batch, src_len)` + features_only (bool, optional): skip LM head and just return + features. If True, the output will be of shape `(batch, src_len, embed_dim)`. + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + + Returns: + tuple: + - the LM output of shape `(batch, src_len, vocab)` + - a dictionary of additional data, where 'inner_states' is a list of hidden states. Note that the + hidden states have shape `(src_len, batch, vocab)`. + """ + + x, extra = self.extract_features( + src_tokens, return_all_hiddens=return_all_hiddens + ) + + if features_only: + return x, extra + + assert target_tokens is not None + + with torch.no_grad(): + # use EMA parameter as the teacher + self.ema.model.eval() + + encoder_out = self.ema.model( + target_tokens, + return_all_hiddens=True, + ) + y = encoder_out["fc_results"] + + y = y[-self.average_top_k_layers :] + + permuted = False + if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: + y = [tl.permute(1, 2, 0) for tl in y] # TBC -> BCT + permuted = True + + if self.cfg.batch_norm_target_layer: + y = [ + F.batch_norm( + tl.float(), running_mean=None, running_var=None, training=True + ) + for tl in y + ] + + if self.cfg.instance_norm_target_layer: + y = [F.instance_norm(tl.float()) for tl in y] + + if permuted: + y = [tl.transpose(1, 2) for tl in y] # BCT -> BTC + + if self.cfg.layer_norm_target_layer: + y = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in y] + + y = sum(y) / len(y) + + if not permuted: + y = y.transpose(0, 1) + + if self.cfg.layer_norm_targets: + y = F.layer_norm(y.float(), y.shape[-1:]) + + if self.cfg.instance_norm_targets: + y = F.instance_norm(y.transpose(1, 2)).transpose(1, 2) + + masked_indices = src_tokens.eq(self.mask_idx) + + x = x[masked_indices] + y = y[masked_indices] + + x = self.regression_head(x) + + sz = x.size(-1) + if self.cfg.loss_beta == 0: + loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) + else: + loss = F.smooth_l1_loss( + x.float(), y.float(), reduction="none", beta=self.cfg.loss_beta + ).sum(dim=-1) + + result = { + "losses": { + "main": loss.sum() / math.sqrt(sz) + if self.loss_scale <= 0 + else loss.sum() * self.loss_scale, + }, + "sample_size": loss.numel(), + } + + # logging other values + other_logs = { + "ema_decay": self.ema.get_decay() * 1000 + } + result["logs"] = other_logs + return result + + def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs): + encoder_out = self.sentence_encoder( + src_tokens, + return_all_hiddens=return_all_hiddens, + token_embeddings=kwargs.get("token_embeddings", None), + ) 
+ # T x B x C -> B x T x C + features = encoder_out["encoder_out"][0].transpose(0, 1) + inner_states = encoder_out["encoder_states"] if return_all_hiddens else None + return features, { + "inner_states": inner_states, + "encoder_embedding": encoder_out["encoder_embedding"][0], + } + + def output_layer(self, features, masked_tokens=None, **unused): + return self.lm_head(features, masked_tokens) + + def max_positions(self): + """Maximum output length supported by the encoder.""" + return self.cfg.max_positions diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py new file mode 100644 index 0000000000000..85cb88f510a66 --- /dev/null +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -0,0 +1,1578 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch data2vec model.""" + +import math + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN, gelu +from ...file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ...utils import logging +from .configuration_data2vec import Data2VecConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "data2vec" +_CONFIG_FOR_DOC = "Data2VecConfig" +_TOKENIZER_FOR_DOC = "RobertaTokenizer" + +DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "data2vec", + # See all data2vec models at https://huggingface.co/models?filter=data2vec +] + + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2Vec +class Data2VecEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
+ """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
+ + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Data2Vec +class Data2VecSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in Data2VecModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class Data2VecSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2Vec +class Data2VecAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = Data2VecSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = Data2VecSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class Data2VecIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class Data2VecOutput(nn.Module): + def __init__(self, config): + 
super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Data2Vec +class Data2VecLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Data2VecAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = Data2VecAttention(config, position_embedding_type="absolute") + self.intermediate = Data2VecIntermediate(config) + self.output = Data2VecOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = 
self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Data2Vec +class Data2VecEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([Data2VecLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class Data2VecPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
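+        # For this RoBERTa-derived model the first token is `<s>`, which plays the same role as
+        # BERT's `[CLS]` token.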
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->Data2Vec,roberta->data2vec +class Data2VecPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Data2VecConfig + base_model_prefix = "data2vec" + supports_gradient_checkpointing = True + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, Data2VecEncoder): + module.gradient_checkpointing = value + + def update_keys_to_ignore(self, config, del_keys_to_ignore): + """Remove some keys from ignore list""" + if not config.tie_word_embeddings: + # must make a new list, or the class variable gets modified! + self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] + self._keys_to_ignore_on_load_missing = [ + k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore + ] + + +DATA2VEC_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Data2VecConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DATA2VEC_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`RobertaTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare data2vec Model transformer outputting raw hidden-states without any specific head on top.", + DATA2VEC_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaModel with ROBERTA->DATA2VEC,Roberta->Data2Vec +class Data2VecModel(Data2VecPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. 
_*Attention is all you need*: https://arxiv.org/abs/1706.03762 + + """ + + _keys_to_ignore_on_load_missing = [r"position_ids"] + + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Data2Vec + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = Data2VecEmbeddings(config) + self.encoder = Data2VecEncoder(config) + + self.pooler = Data2VecPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.bert.modeling_bert.BertModel.forward + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
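+        # get_extended_attention_mask() also converts the 1/0 mask into an additive mask
+        # (0.0 for positions to keep, a large negative value for masked positions), so it can
+        # simply be added to the raw attention scores.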
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """data2vec Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VEC_START_DOCSTRING +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec,roberta-base->data2vec +class Data2VecForCausalLM(Data2VecPreTrainedModel): + _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `Data2VecLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.lm_head = Data2VecLMHead(config) + + # The LM head weights require special treatment only when they are tied with the word embeddings + self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + 
@add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ + Returns: + + Example: + + ```python + >>> from transformers import Data2VecTokenizer, Data2VecForCausalLM, Data2VecConfig + >>> import torch + + >>> tokenizer = Data2VecTokenizer.from_pretrained("data2vec-base") + >>> config = Data2VecConfig.from_pretrained("data2vec-base") + >>> config.is_decoder = True + >>> model = Data2VecForCausalLM.from_pretrained("data2vec-base", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.data2vec( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VEC_START_DOCSTRING) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec +class Data2VecForMaskedLM(Data2VecPreTrainedModel): + _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `Data2VecForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.lm_head = Data2VecLMHead(config) + + # The LM head weights require special treatment only when they are tied with the word embeddings + self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + kwargs (`Dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.data2vec( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Data2Vec +class Data2VecLMHead(nn.Module): + """Data2Vec Head for masked language modeling.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, features, **kwargs): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias + + 
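With `Data2VecForMaskedLM` and its `Data2VecLMHead` now fully wired up, a minimal smoke-test sketch may help reviewers exercise the head end to end. The checkpoint name `data2vec-base` is a placeholder (this patch does not publish weights), and `RobertaTokenizer` is an assumption based on the model reusing the RoBERTa vocabulary:

```python
# Hypothetical smoke test for the masked-LM head; "data2vec-base" is a placeholder
# checkpoint name and RobertaTokenizer is assumed because of the RoBERTa-derived vocabulary.
import torch

from transformers import Data2VecForMaskedLM, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("data2vec-base")
model = Data2VecForMaskedLM.from_pretrained("data2vec-base")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Pick the most likely token at each masked position.
mask_positions = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.decode(predicted_ids))
```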
+@add_start_docstrings( + """ + data2vec Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + DATA2VEC_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec +class Data2VecForSequenceClassification(Data2VecPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.classifier = Data2VecClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.data2vec( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Data2Vec Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + DATA2VEC_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec +class Data2VecForMultipleChoice(Data2VecPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + + self.data2vec = Data2VecModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + DATA2VEC_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + token_type_ids=None, + attention_mask=None, + labels=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.data2vec( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Data2Vec Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + DATA2VEC_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec +class Data2VecForTokenClassification(Data2VecPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.data2vec = Data2VecModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.data2vec( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Data2Vec +class Data2VecClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@add_start_docstrings( + """ + Data2Vec Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + DATA2VEC_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec +class Data2VecForQuestionAnswering(Data2VecPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.data2vec( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. 
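+    # Worked example with padding_idx=1 and no past: input_ids [[5, 7, 1, 1]] gives mask [[1, 1, 0, 0]],
+    # cumsum * mask [[1, 2, 0, 0]], and position ids [[2, 3, 1, 1]]; real tokens count up from
+    # padding_idx + 1 while padding positions keep padding_idx.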
+ mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 21e94075a8dae..129f57a297d9f 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1139,6 +1139,65 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class Data2VecForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecForMaskedLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecForMultipleChoice(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py new file mode 100644 index 0000000000000..69a759b4c6b11 --- /dev/null +++ b/tests/test_modeling_data2vec.py @@ -0,0 +1,556 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest +from copy import deepcopy + +from transformers import Data2VecConfig, is_torch_available +from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecForCausalLM, + Data2VecForMaskedLM, + Data2VecForMultipleChoice, + Data2VecForQuestionAnswering, + Data2VecForSequenceClassification, + Data2VecForTokenClassification, + Data2VecModel, + ) + from transformers.models.data2vec.modeling_data2vec import ( + DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecEmbeddings, + create_position_ids_from_input_ids, + ) + +DATA2VEC_TINY = "sshleifer/tiny-distildata2vec" + + +class Data2VecModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return Data2VecConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + 
encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = Data2VecModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = Data2VecForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = Data2VecForCausalLM(config=config).to(torch_device).eval() + + # make sure that ids don't start with pad token + mask = input_ids.ne(config.pad_token_id).long() + input_ids = input_ids * mask + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + + # make sure that ids don't start with pad token + mask = next_tokens.ne(config.pad_token_id).long() + next_tokens = next_tokens * mask + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + 
)["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = Data2VecForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = Data2VecForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class 
Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + Data2VecForCausalLM, + Data2VecForMaskedLM, + Data2VecModel, + Data2VecForSequenceClassification, + Data2VecForTokenClassification, + Data2VecForMultipleChoice, + Data2VecForQuestionAnswering, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (Data2VecForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = Data2VecModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = Data2VecModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_create_position_ids_respects_padding_index(self): + """Ensure that the default position ids only assign a sequential . This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. 
Therefore, the + first available non-padding position index is Data2VecEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = Data2VecEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) + + position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """Ensure that the default position ids only assign a sequential . This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is Data2VecEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = Data2VecEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + +@require_torch +class Data2VecModelIntegrationTest(TestCasePlus): + @slow + def test_inference_masked_lm(self): + model = Data2VecForMaskedLM.from_pretrained("data2vec") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] + ) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') + # data2vec.eval() + # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_no_head(self): + model = Data2VecModel.from_pretrained("data2vec") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + # compare the actual values for a slice. 
+ expected_slice = torch.tensor( + [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] + ) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') + # data2vec.eval() + # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_classification_head(self): + model = Data2VecForSequenceClassification.from_pretrained("data2vec-large-mnli") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 3)) + self.assertEqual(output.shape, expected_shape) + expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli') + # data2vec.eval() + # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() + + self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) + + # XXX: this might be a candidate for common tests if we have many of those + def test_lm_head_ignore_keys(self): + keys_to_ignore_on_save_tied = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + keys_to_ignore_on_save_untied = [r"lm_head.decoder.bias"] + config = Data2VecConfig.from_pretrained(DATA2VEC_TINY) + config_tied = deepcopy(config) + config_tied.tie_word_embeddings = True + config_untied = deepcopy(config) + config_untied.tie_word_embeddings = False + for cls in [Data2VecForMaskedLM, Data2VecForCausalLM]: + model = cls(config_tied) + self.assertEqual(model._keys_to_ignore_on_save, keys_to_ignore_on_save_tied, cls) + + # the keys should be different when embeddings aren't tied + model = cls(config_untied) + self.assertEqual(model._keys_to_ignore_on_save, keys_to_ignore_on_save_untied, cls) + + # test that saving works with updated ignore keys - just testing that it doesn't fail + model.save_pretrained(self.get_auto_remove_tmp_dir()) From 9e79329ac80a268a9fbf0ddf4b764d850319fad5 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Thu, 3 Feb 2022 15:53:55 -0500 Subject: [PATCH 002/121] Add checkpoint conversion script --- ..._original_pytorch_checkpoint_to_pytorch.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index d8c13ff0a1b40..8b8220d35e43f 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -16,11 +16,11 @@ import argparse +import os import pathlib import fairseq import torch -from fairseq.models.data2vec import Data2VecModel as FairseqData2VecModel from fairseq.modules import TransformerSentenceEncoderLayer from packaging import version @@ -33,6 +33,7 @@ BertSelfOutput, ) from transformers.utils import logging +from .data2vec_text import Data2VecTextModel if version.parse(fairseq.__version__) < version.parse("0.9.0"): @@ -51,15 +52,17 @@ def convert_data2vec_checkpoint_to_pytorch( """ Copy/paste/tweak data2vec's weights to our BERT structure. 
""" - data2vec = FairseqData2VecModel.from_pretrained(data2vec_checkpoint_path) + data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path) + data2vec = Data2VecTextModel.from_pretrained(data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name) data2vec.eval() # disable dropout - data2vec_sent_encoder = data2vec.model.encoder.sentence_encoder + data2vec_model = data2vec.models[0] + data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder config = Data2VecConfig( vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings, - hidden_size=data2vec.args.encoder_embed_dim, - num_hidden_layers=data2vec.args.encoder_layers, - num_attention_heads=data2vec.args.encoder_attention_heads, - intermediate_size=data2vec.args.encoder_ffn_embed_dim, + hidden_size=data2vec_model.args.encoder_embed_dim, + num_hidden_layers=data2vec_model.args.encoder_layers, + num_attention_heads=data2vec_model.args.encoder_attention_heads, + intermediate_size=data2vec_model.args.encoder_ffn_embed_dim, max_position_embeddings=514, type_vocab_size=1, layer_norm_eps=1e-5, # PyTorch default used in fairseq @@ -78,8 +81,8 @@ def convert_data2vec_checkpoint_to_pytorch( model.data2vec.embeddings.token_type_embeddings.weight.data = torch.zeros_like( model.data2vec.embeddings.token_type_embeddings.weight ) # just zero them out b/c data2vec doesn't use them. - model.data2vec.embeddings.LayerNorm.weight = data2vec_sent_encoder.emb_layer_norm.weight - model.data2vec.embeddings.LayerNorm.bias = data2vec_sent_encoder.emb_layer_norm.bias + model.data2vec.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight + model.data2vec.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias for i in range(config.num_hidden_layers): # Encoder: start of layer @@ -132,12 +135,12 @@ def convert_data2vec_checkpoint_to_pytorch( model.classifier.out_proj.bias = data2vec.model.classification_heads["mnli"].out_proj.bias else: # LM Head - model.lm_head.dense.weight = data2vec.model.encoder.lm_head.dense.weight - model.lm_head.dense.bias = data2vec.model.encoder.lm_head.dense.bias - model.lm_head.layer_norm.weight = data2vec.model.encoder.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = data2vec.model.encoder.lm_head.layer_norm.bias - model.lm_head.decoder.weight = data2vec.model.encoder.lm_head.weight - model.lm_head.decoder.bias = data2vec.model.encoder.lm_head.bias + model.lm_head.dense.weight = data2vec_model.encoder.lm_head.dense.weight + model.lm_head.dense.bias = data2vec_model.encoder.lm_head.dense.bias + model.lm_head.layer_norm.weight = data2vec_model.encoder.lm_head.layer_norm.weight + model.lm_head.layer_norm.bias = data2vec_model.encoder.lm_head.layer_norm.bias + model.lm_head.decoder.weight = data2vec_model.encoder.lm_head.weight + model.lm_head.decoder.bias = data2vec_model.encoder.lm_head.bias # Let's check that we get the same results. 
input_ids: torch.Tensor = data2vec.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 @@ -146,7 +149,7 @@ def convert_data2vec_checkpoint_to_pytorch( if classification_head: their_output = data2vec.model.classification_heads["mnli"](data2vec.extract_features(input_ids)) else: - their_output = data2vec.model(input_ids)[0] + their_output = data2vec_model(input_ids)[0] print(our_output.shape, their_output.shape) max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 From 47e6bbce2f66086bc8114362677a2965454df6bd Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Thu, 3 Feb 2022 16:12:35 -0500 Subject: [PATCH 003/121] Fix copies --- src/transformers/models/data2vec/modeling_data2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 85cb88f510a66..172958898535b 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -62,7 +62,6 @@ ] - # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2Vec class Data2VecEmbeddings(nn.Module): """ From 5e70a951896f8558df23b3321cf62b9da68b3fa4 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Thu, 3 Feb 2022 16:21:24 -0500 Subject: [PATCH 004/121] Update docs --- src/transformers/models/data2vec/modeling_data2vec.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 172958898535b..dbdfcd447ca55 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -952,13 +952,13 @@ def forward( Example: ```python - >>> from transformers import Data2VecTokenizer, Data2VecForCausalLM, Data2VecConfig + >>> from transformers import RobertaTokenizer, Data2VecForSequenceClassification, Data2VecConfig >>> import torch - >>> tokenizer = Data2VecTokenizer.from_pretrained("data2vec-base") - >>> config = Data2VecConfig.from_pretrained("data2vec-base") - >>> config.is_decoder = True - >>> model = Data2VecForCausalLM.from_pretrained("data2vec-base", config=config) + >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-large") + >>> config = Data2VecConfig.from_pretrained("edugp/data2vec-nlp-base") + >>> model = Data2VecForSequenceClassification.from_pretrained("edugp/data2vec-nlp-base", config=config) + >>> # Fine-tune this model >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) From 2eddba96edb804dcfc39b102c0d9d41a287ee776 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Thu, 3 Feb 2022 16:29:13 -0500 Subject: [PATCH 005/121] Add checkpoint conversion script --- ..._original_pytorch_checkpoint_to_pytorch.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index d8c13ff0a1b40..8b8220d35e43f 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -16,11 +16,11 @@ import argparse +import os import pathlib 
import fairseq import torch -from fairseq.models.data2vec import Data2VecModel as FairseqData2VecModel from fairseq.modules import TransformerSentenceEncoderLayer from packaging import version @@ -33,6 +33,7 @@ BertSelfOutput, ) from transformers.utils import logging +from .data2vec_text import Data2VecTextModel if version.parse(fairseq.__version__) < version.parse("0.9.0"): @@ -51,15 +52,17 @@ def convert_data2vec_checkpoint_to_pytorch( """ Copy/paste/tweak data2vec's weights to our BERT structure. """ - data2vec = FairseqData2VecModel.from_pretrained(data2vec_checkpoint_path) + data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path) + data2vec = Data2VecTextModel.from_pretrained(data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name) data2vec.eval() # disable dropout - data2vec_sent_encoder = data2vec.model.encoder.sentence_encoder + data2vec_model = data2vec.models[0] + data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder config = Data2VecConfig( vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings, - hidden_size=data2vec.args.encoder_embed_dim, - num_hidden_layers=data2vec.args.encoder_layers, - num_attention_heads=data2vec.args.encoder_attention_heads, - intermediate_size=data2vec.args.encoder_ffn_embed_dim, + hidden_size=data2vec_model.args.encoder_embed_dim, + num_hidden_layers=data2vec_model.args.encoder_layers, + num_attention_heads=data2vec_model.args.encoder_attention_heads, + intermediate_size=data2vec_model.args.encoder_ffn_embed_dim, max_position_embeddings=514, type_vocab_size=1, layer_norm_eps=1e-5, # PyTorch default used in fairseq @@ -78,8 +81,8 @@ def convert_data2vec_checkpoint_to_pytorch( model.data2vec.embeddings.token_type_embeddings.weight.data = torch.zeros_like( model.data2vec.embeddings.token_type_embeddings.weight ) # just zero them out b/c data2vec doesn't use them. - model.data2vec.embeddings.LayerNorm.weight = data2vec_sent_encoder.emb_layer_norm.weight - model.data2vec.embeddings.LayerNorm.bias = data2vec_sent_encoder.emb_layer_norm.bias + model.data2vec.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight + model.data2vec.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias for i in range(config.num_hidden_layers): # Encoder: start of layer @@ -132,12 +135,12 @@ def convert_data2vec_checkpoint_to_pytorch( model.classifier.out_proj.bias = data2vec.model.classification_heads["mnli"].out_proj.bias else: # LM Head - model.lm_head.dense.weight = data2vec.model.encoder.lm_head.dense.weight - model.lm_head.dense.bias = data2vec.model.encoder.lm_head.dense.bias - model.lm_head.layer_norm.weight = data2vec.model.encoder.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = data2vec.model.encoder.lm_head.layer_norm.bias - model.lm_head.decoder.weight = data2vec.model.encoder.lm_head.weight - model.lm_head.decoder.bias = data2vec.model.encoder.lm_head.bias + model.lm_head.dense.weight = data2vec_model.encoder.lm_head.dense.weight + model.lm_head.dense.bias = data2vec_model.encoder.lm_head.dense.bias + model.lm_head.layer_norm.weight = data2vec_model.encoder.lm_head.layer_norm.weight + model.lm_head.layer_norm.bias = data2vec_model.encoder.lm_head.layer_norm.bias + model.lm_head.decoder.weight = data2vec_model.encoder.lm_head.weight + model.lm_head.decoder.bias = data2vec_model.encoder.lm_head.bias # Let's check that we get the same results. 
input_ids: torch.Tensor = data2vec.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 @@ -146,7 +149,7 @@ def convert_data2vec_checkpoint_to_pytorch( if classification_head: their_output = data2vec.model.classification_heads["mnli"](data2vec.extract_features(input_ids)) else: - their_output = data2vec.model(input_ids)[0] + their_output = data2vec_model(input_ids)[0] print(our_output.shape, their_output.shape) max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 From 991e6d92241d03c60f9726e4821a87f62d7372cd Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:30:04 -0500 Subject: [PATCH 006/121] Remove fairseq data2vec_text script and fix format --- ..._original_pytorch_checkpoint_to_pytorch.py | 5 +- .../models/data2vec/data2vec_text.py | 519 ------------------ 2 files changed, 4 insertions(+), 520 deletions(-) delete mode 100644 src/transformers/models/data2vec/data2vec_text.py diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index 8b8220d35e43f..bc4e41e4e058a 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -33,6 +33,7 @@ BertSelfOutput, ) from transformers.utils import logging + from .data2vec_text import Data2VecTextModel @@ -53,7 +54,9 @@ def convert_data2vec_checkpoint_to_pytorch( Copy/paste/tweak data2vec's weights to our BERT structure. """ data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path) - data2vec = Data2VecTextModel.from_pretrained(data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name) + data2vec = Data2VecTextModel.from_pretrained( + data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name + ) data2vec.eval() # disable dropout data2vec_model = data2vec.models[0] data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder diff --git a/src/transformers/models/data2vec/data2vec_text.py b/src/transformers/models/data2vec/data2vec_text.py deleted file mode 100644 index 5cc1d37e842aa..0000000000000 --- a/src/transformers/models/data2vec/data2vec_text.py +++ /dev/null @@ -1,519 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from dataclasses import dataclass, field -from typing import Optional -import logging -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from omegaconf import II - -from fairseq.dataclass import FairseqDataclass -from fairseq.dataclass.configs import EMAConfig -from fairseq.models.ema import EMA -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderModel, - register_model, -) -from fairseq.models.roberta.model import RobertaLMHead, RobertaClassificationHead -from fairseq.models.transformer import TransformerEncoder, TransformerConfig -from fairseq.modules.transformer_sentence_encoder import init_bert_params - -logger = logging.getLogger(__name__) - - -@dataclass -class Data2VecTextConfig(FairseqDataclass): - max_positions: int = II("task.tokens_per_sample") - - head_layers: int = 1 - - transformer: TransformerConfig = TransformerConfig() - - load_checkpoint_heads: bool = field( - default=False, - metadata={"help": "(re-)register and load heads when loading checkpoints"}, - ) - - loss_beta: float = field( - default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"} - ) - loss_scale: Optional[float] = field( - default=None, - metadata={ - "help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)" - }, - ) - average_top_k_layers: int = field( - default=8, metadata={"help": "how many layers to average"} - ) - - layer_norm_target_layer: bool = False - instance_norm_target_layer: bool = False - batch_norm_target_layer: bool = False - instance_norm_targets: bool = False - layer_norm_targets: bool = False - - ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) - ema_end_decay: float = field( - default=0.9999, metadata={"help": "final ema decay rate"} - ) - - # when to finish annealing ema decay rate - ema_anneal_end_step: int = II("optimization.max_update") - - ema_transformer_layers_only: bool = field( - default=True, - metadata={"help": "whether to momentum update only the transformer layers"}, - ) - - -def get_annealed_rate(start, end, curr_step, total_steps): - r = end - start - pct_remaining = 1 - curr_step / total_steps - return end - r * pct_remaining - - -@register_model("data2vec_text", dataclass=Data2VecTextConfig) -class Data2VecTextModel(FairseqEncoderModel): - def __init__(self, cfg: Data2VecTextConfig, encoder): - super().__init__(encoder) - self.cfg = cfg - - # We follow BERT's random weight initialization - self.apply(init_bert_params) - - self.classification_heads = nn.ModuleDict() - - @classmethod - def build_model(cls, cfg, task): - """Build a new model instance.""" - - encoder = Data2VecTextEncoder(cfg, task.source_dictionary, task.cfg.data) - - return cls(cfg, encoder) - - def forward( - self, - src_tokens, - target_tokens=None, - features_only=False, - return_all_hiddens=False, - classification_head_name=None, - **kwargs, - ): - if classification_head_name is not None: - features_only = True - - res = self.encoder( - src_tokens, target_tokens, features_only, return_all_hiddens, **kwargs - ) - - if isinstance(res, tuple): - x, extra = res - else: - return res - - if classification_head_name is not None: - x = self.classification_heads[classification_head_name](x) - return x, extra - - def get_normalized_probs(self, net_output, log_probs, sample=None): - """Get normalized probabilities (or log probs) from a net's output.""" - logits = net_output[0].float() - if log_probs: - return F.log_softmax(logits, dim=-1) - else: - return F.softmax(logits, 
dim=-1) - - def register_classification_head( - self, name, num_classes=None, inner_dim=None, **kwargs - ): - """Register a classification head.""" - if name in self.classification_heads: - prev_num_classes = self.classification_heads[name].out_proj.out_features - prev_inner_dim = self.classification_heads[name].dense.out_features - if num_classes != prev_num_classes or inner_dim != prev_inner_dim: - logger.warning( - 're-registering head "{}" with num_classes {} (prev: {}) ' - "and inner_dim {} (prev: {})".format( - name, num_classes, prev_num_classes, inner_dim, prev_inner_dim - ) - ) - self.classification_heads[name] = RobertaClassificationHead( - input_dim=self.cfg.transformer.encoder.embed_dim, - inner_dim=inner_dim or self.cfg.transformer.encoder.embed_dim, - num_classes=num_classes, - activation_fn="tanh", - pooler_dropout=0, - ) - - @property - def supported_targets(self): - return {"self"} - - def upgrade_state_dict_named(self, state_dict, name): - prefix = name + "." if name != "" else "" - - # rename decoder -> encoder before upgrading children modules - for k in list(state_dict.keys()): - if k.startswith(prefix + "decoder"): - new_k = prefix + "encoder" + k[len(prefix + "decoder") :] - state_dict[new_k] = state_dict[k] - del state_dict[k] - - # rename emb_layer_norm -> layernorm_embedding - for k in list(state_dict.keys()): - if ".emb_layer_norm." in k: - new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.") - state_dict[new_k] = state_dict[k] - del state_dict[k] - - if self.encoder.regression_head is not None: - if ".lm_head." in k: - new_k = k.replace(".lm_head.", ".regression_head.") - state_dict[new_k] = state_dict[k] - del state_dict[k] - else: - if ".regression_head." in k: - del state_dict[k] - - # upgrade children modules - super().upgrade_state_dict_named(state_dict, name) - - # Handle new classification heads present in the state dict. - current_head_names = ( - [] - if not hasattr(self, "classification_heads") - or self.classification_heads is None - else self.classification_heads.keys() - ) - keys_to_delete = [] - for k in state_dict.keys(): - if not k.startswith(prefix + "classification_heads."): - continue - - head_name = k[len(prefix + "classification_heads.") :].split(".")[0] - num_classes = state_dict[ - prefix + "classification_heads." + head_name + ".out_proj.weight" - ].size(0) - inner_dim = state_dict[ - prefix + "classification_heads." + head_name + ".dense.weight" - ].size(0) - - if self.cfg.load_checkpoint_heads: - if head_name not in current_head_names: - self.register_classification_head(head_name, num_classes, inner_dim) - else: - if head_name not in current_head_names: - logger.warning( - "deleting classification head ({}) from checkpoint " - "not present in current model: {}".format(head_name, k) - ) - keys_to_delete.append(k) - elif ( - num_classes - != self.classification_heads[head_name].out_proj.out_features - or inner_dim - != self.classification_heads[head_name].dense.out_features - ): - logger.warning( - "deleting classification head ({}) from checkpoint " - "with different dimensions than current model: {}".format( - head_name, k - ) - ) - keys_to_delete.append(k) - for k in keys_to_delete: - del state_dict[k] - - # Copy any newly-added classification heads into the state dict - # with their current weights. 
- if ( - hasattr(self, "classification_heads") - and self.classification_heads is not None - and len(self.classification_heads) > 0 - ): - cur_state = self.classification_heads.state_dict() - for k, v in cur_state.items(): - if prefix + "classification_heads." + k not in state_dict: - logger.info("Overwriting " + prefix + "classification_heads." + k) - state_dict[prefix + "classification_heads." + k] = v - - for k in list(state_dict.keys()): - if k.startswith(prefix + "encoder.lm_head.") or k.startswith( - prefix + "encoder.emb_head." - ): - del state_dict[k] - - self.encoder.lm_head = None - - if self.encoder.target_model is None: - for k in list(state_dict.keys()): - if k.startswith(prefix + "encoder.target_model."): - del state_dict[k] - - if (self.encoder.ema is None) and (prefix + "encoder._ema" in state_dict): - del state_dict[prefix + "encoder._ema"] - - def remove_pretraining_modules(self, last_layer=None): - self.encoder.lm_head = None - self.encoder.regression_head = None - self.encoder.ema = None - self.classification_heads = None - - if last_layer is not None: - self.encoder.sentence_encoder.layers = nn.ModuleList( - l - for i, l in enumerate(self.encoder.sentence_encoder.layers) - if i <= last_layer - ) - self.encoder.sentence_encoder.layer_norm = None - - -class Data2VecTextEncoder(FairseqEncoder): - def __init__(self, cfg: Data2VecTextConfig, dictionary, task_data): - super().__init__(dictionary) - - # HACK - self.target_model = None - - self.cfg = cfg - - embed_tokens = self.build_embedding( - len(dictionary), cfg.transformer.encoder.embed_dim, dictionary.pad() - ) - - self.sentence_encoder = self.build_encoder(cfg, dictionary, embed_tokens) - self.mask_idx = dictionary.index("") - assert self.mask_idx != dictionary.unk(), dictionary.symbols - - self.ema = None - self.average_top_k_layers = cfg.average_top_k_layers - self.loss_scale = cfg.loss_scale - - assert self.cfg.head_layers >= 1 - - embed_dim = cfg.transformer.encoder.embed_dim - curr_dim = embed_dim - projs = [] - for i in range(self.cfg.head_layers - 1): - next_dim = embed_dim * 2 if i == 0 else curr_dim - projs.append(nn.Linear(curr_dim, next_dim)) - projs.append(nn.GELU()) - curr_dim = next_dim - - projs.append(nn.Linear(curr_dim, embed_dim)) - self.regression_head = nn.Sequential(*projs) - - self.num_updates = 0 - - def build_embedding(self, vocab_size, embedding_dim, padding_idx): - return nn.Embedding(vocab_size, embedding_dim, padding_idx) - - def build_encoder(self, cfg, dictionary, embed_tokens): - encoder = TransformerEncoder(cfg.transformer, dictionary, embed_tokens, return_fc=True) - encoder.apply(init_bert_params) - return encoder - - def build_lm_head(self, embed_dim, output_dim, activation_fn, weight): - return RobertaLMHead(embed_dim, output_dim, activation_fn, weight) - - def make_ema_teacher(self): - ema_config = EMAConfig( - ema_decay=self.cfg.ema_decay, - ema_fp32=True, - ) - skip_keys = set() - if self.cfg.ema_transformer_layers_only: - for k, _ in self.sentence_encoder.embed_positions.named_parameters(): - skip_keys.add(f"embed_tokens.{k}") - for k, _ in self.sentence_encoder.embed_positions.named_parameters(): - skip_keys.add(f"embed_positions.{k}") - if self.sentence_encoder.layernorm_embedding is not None: - for ( - k, - _, - ) in self.sentence_encoder.layernorm_embedding.named_parameters(): - skip_keys.add(f"layernorm_embedding.{k}") - if self.sentence_encoder.layer_norm is not None: - for k, _ in self.sentence_encoder.layer_norm.named_parameters(): - 
skip_keys.add(f"layernorm_embedding.{k}") - - self.ema = EMA( - self.sentence_encoder, - ema_config, - skip_keys=skip_keys, - ) - - def set_num_updates(self, num_updates): - super().set_num_updates(num_updates) - - if self.ema is None and self.regression_head is not None: - logger.info(f"making ema teacher") - self.make_ema_teacher() - elif self.training and self.ema is not None: - if self.cfg.ema_decay != self.cfg.ema_end_decay: - if num_updates >= self.cfg.ema_anneal_end_step: - decay = self.cfg.ema_end_decay - else: - decay = get_annealed_rate( - self.cfg.ema_decay, - self.cfg.ema_end_decay, - num_updates, - self.cfg.ema_anneal_end_step, - ) - self.ema._set_decay(decay) - if self.ema.get_decay() < 1: - self.ema.step(self.sentence_encoder) - - def state_dict(self, destination=None, prefix="", keep_vars=False): - state = super().state_dict(destination, prefix, keep_vars) - if self.ema is not None: - state[prefix + "_ema"] = self.ema.fp32_params - return state - - def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): - if self.ema is not None: - k = prefix + "_ema" - assert k in state_dict - self.ema.restore(state_dict[k], True) - del state_dict[k] - return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) - - def forward( - self, - src_tokens, - target_tokens=None, - features_only=False, - return_all_hiddens=False, - masked_tokens=None, - **unused, - ): - """ - Args: - src_tokens (LongTensor): input tokens of shape `(batch, src_len)` - features_only (bool, optional): skip LM head and just return - features. If True, the output will be of shape `(batch, src_len, embed_dim)`. - return_all_hiddens (bool, optional): also return all of the - intermediate hidden states (default: False). - - Returns: - tuple: - - the LM output of shape `(batch, src_len, vocab)` - - a dictionary of additional data, where 'inner_states' is a list of hidden states. Note that the - hidden states have shape `(src_len, batch, vocab)`. 
- """ - - x, extra = self.extract_features( - src_tokens, return_all_hiddens=return_all_hiddens - ) - - if features_only: - return x, extra - - assert target_tokens is not None - - with torch.no_grad(): - # use EMA parameter as the teacher - self.ema.model.eval() - - encoder_out = self.ema.model( - target_tokens, - return_all_hiddens=True, - ) - y = encoder_out["fc_results"] - - y = y[-self.average_top_k_layers :] - - permuted = False - if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: - y = [tl.permute(1, 2, 0) for tl in y] # TBC -> BCT - permuted = True - - if self.cfg.batch_norm_target_layer: - y = [ - F.batch_norm( - tl.float(), running_mean=None, running_var=None, training=True - ) - for tl in y - ] - - if self.cfg.instance_norm_target_layer: - y = [F.instance_norm(tl.float()) for tl in y] - - if permuted: - y = [tl.transpose(1, 2) for tl in y] # BCT -> BTC - - if self.cfg.layer_norm_target_layer: - y = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in y] - - y = sum(y) / len(y) - - if not permuted: - y = y.transpose(0, 1) - - if self.cfg.layer_norm_targets: - y = F.layer_norm(y.float(), y.shape[-1:]) - - if self.cfg.instance_norm_targets: - y = F.instance_norm(y.transpose(1, 2)).transpose(1, 2) - - masked_indices = src_tokens.eq(self.mask_idx) - - x = x[masked_indices] - y = y[masked_indices] - - x = self.regression_head(x) - - sz = x.size(-1) - if self.cfg.loss_beta == 0: - loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) - else: - loss = F.smooth_l1_loss( - x.float(), y.float(), reduction="none", beta=self.cfg.loss_beta - ).sum(dim=-1) - - result = { - "losses": { - "main": loss.sum() / math.sqrt(sz) - if self.loss_scale <= 0 - else loss.sum() * self.loss_scale, - }, - "sample_size": loss.numel(), - } - - # logging other values - other_logs = { - "ema_decay": self.ema.get_decay() * 1000 - } - result["logs"] = other_logs - return result - - def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs): - encoder_out = self.sentence_encoder( - src_tokens, - return_all_hiddens=return_all_hiddens, - token_embeddings=kwargs.get("token_embeddings", None), - ) - # T x B x C -> B x T x C - features = encoder_out["encoder_out"][0].transpose(0, 1) - inner_states = encoder_out["encoder_states"] if return_all_hiddens else None - return features, { - "inner_states": inner_states, - "encoder_embedding": encoder_out["encoder_embedding"][0], - } - - def output_layer(self, features, masked_tokens=None, **unused): - return self.lm_head(features, masked_tokens) - - def max_positions(self): - """Maximum output length supported by the encoder.""" - return self.cfg.max_positions From caeb28d44a569c8d16f010ba6e5628273a91e59a Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:32:52 -0500 Subject: [PATCH 007/121] Add comment on where to get data2vec_text.py --- ...convert_data2vec_original_pytorch_checkpoint_to_pytorch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index bc4e41e4e058a..9861e4a6f451e 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -34,7 +34,9 @@ ) from transformers.utils import logging -from .data2vec_text import Data2VecTextModel +from 
.data2vec_text import ( + Data2VecTextModel, # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py +) if version.parse(fairseq.__version__) < version.parse("0.9.0"): From 4c0565e92e8b0be74cfd83ff81a7779a73651cd0 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:37:16 -0500 Subject: [PATCH 008/121] Remove mock implementation cheat.py and fix style --- src/transformers/models/data2vec/cheat.py | 40 ------------------- .../models/data2vec/configuration_data2vec.py | 1 - 2 files changed, 41 deletions(-) delete mode 100644 src/transformers/models/data2vec/cheat.py diff --git a/src/transformers/models/data2vec/cheat.py b/src/transformers/models/data2vec/cheat.py deleted file mode 100644 index b043bef5a3547..0000000000000 --- a/src/transformers/models/data2vec/cheat.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -import tarfile -import urllib.request - -# Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py -from .data2vec_text import Data2VecTextModel - - -class Data2VecFairseqProxy(): - def __init__(self, module): - self.module = module - - @classmethod - def from_pretrained(cls, mname): - ckpt = f"{mname}.pt" - cls._download_weights(model=ckpt) - return cls(Data2VecTextModel.from_pretrained("roberta/roberta.large", checkpoint_file=ckpt)) - - @staticmethod - def _download_weights(model: str="nlp_base.pt"): - assert model in ("nlp_base.pt", "audio_base_ls.pt"), "Weights not found" - root_url = "https://dl.fbaipublicfiles.com/fairseq" - - if model == "nlp_base.pt": - # Need download RoBERTa first to get the dictionary file - if not os.path.isdir("roberta"): - print("Downloading roberta") - urllib.request.urlretrieve(f"{root_url}/models/roberta.large.tar.gz", "roberta.large.tar.gz") - with tarfile.open("roberta.large.tar.gz") as f: - f.extractall("roberta") - # Remove Roberta model weights and tar file - os.remove(os.path.join("roberta", "roberta.large", "model.pt")) - os.remove(os.path.join("roberta.large.tar.gz")) - - # Then download the actual data2vec weights - model_url = f"{root_url}/data2vec/{model}" - model_path = os.path.join("roberta", "roberta.large", model) - if not os.path.isfile(model_path): - print("Downloading model...") - urllib.request.urlretrieve(model_url, model_path) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index 43a80373ae748..f4929e2854d16 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -29,7 +29,6 @@ } - class Data2VecConfig(BertConfig): r""" This is the configuration class to store the configuration of a [`Data2VecModel`] or a [`TFData2VecModel`]. 
It is From 35170388e5f44de6b174b767fc44c3e6cb0b1dcd Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:41:26 -0500 Subject: [PATCH 009/121] Fix copies --- src/transformers/models/data2vec/modeling_data2vec.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index dbdfcd447ca55..172958898535b 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -952,13 +952,13 @@ def forward( Example: ```python - >>> from transformers import RobertaTokenizer, Data2VecForSequenceClassification, Data2VecConfig + >>> from transformers import Data2VecTokenizer, Data2VecForCausalLM, Data2VecConfig >>> import torch - >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-large") - >>> config = Data2VecConfig.from_pretrained("edugp/data2vec-nlp-base") - >>> model = Data2VecForSequenceClassification.from_pretrained("edugp/data2vec-nlp-base", config=config) - >>> # Fine-tune this model + >>> tokenizer = Data2VecTokenizer.from_pretrained("data2vec-base") + >>> config = Data2VecConfig.from_pretrained("data2vec-base") + >>> config.is_decoder = True + >>> model = Data2VecForCausalLM.from_pretrained("data2vec-base", config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) From 54d0f37e5039f829c114a68fe2181267e2b94f1c Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:45:14 -0500 Subject: [PATCH 010/121] Remove TF and Flax classes from init --- docs/source/index.mdx | 2 +- src/transformers/__init__.py | 25 ------------------------- 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 42b3df240bd08..8219f3b89e006 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -177,7 +177,7 @@ Flax), PyTorch, and/or TensorFlow. 
| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| data2vec | ❌ | ❌ | ✅ | ✅ | ✅ | +| data2vec | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 697d1dd7b95e9..ebc184f567b58 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1709,20 +1709,6 @@ "TFCTRLPreTrainedModel", ] ) - _import_structure["models.data2vec"].extend( - [ - "TF_DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", - "TFData2VecForCausalLM", - "TFData2VecForMaskedLM", - "TFData2VecForMultipleChoice", - "TFData2VecForQuestionAnswering", - "TFData2VecForSequenceClassification", - "TFData2VecForTokenClassification", - "TFData2VecMainLayer", - "TFData2VecModel", - "TFData2VecPreTrainedModel", - ] - ) _import_structure["models.deberta"].extend( [ "TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2175,17 +2161,6 @@ "FlaxCLIPVisionPreTrainedModel", ] ) - _import_structure["models.data2vec"].extend( - [ - "FlaxData2VecForMaskedLM", - "FlaxData2VecForMultipleChoice", - "FlaxData2VecForQuestionAnswering", - "FlaxData2VecForSequenceClassification", - "FlaxData2VecForTokenClassification", - "FlaxData2VecModel", - "FlaxData2VecPreTrainedModel", - ] - ) _import_structure["models.distilbert"].extend( [ "FlaxDistilBertForMaskedLM", From 81f36f7ac5d78ce9f0658b7309f3e94e016328a0 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:57:26 -0500 Subject: [PATCH 011/121] Add back copy from fairseq data2vec_text.py and fix style --- ..._original_pytorch_checkpoint_to_pytorch.py | 5 +- .../models/data2vec/data2vec_text.py | 472 ++++++++++++++++++ 2 files changed, 474 insertions(+), 3 deletions(-) create mode 100644 src/transformers/models/data2vec/data2vec_text.py diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index 9861e4a6f451e..973af5144bcb6 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -34,9 +34,8 @@ ) from transformers.utils import logging -from .data2vec_text import ( - Data2VecTextModel, # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py -) +# File copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py +from .data2vec_text import Data2VecTextModel if version.parse(fairseq.__version__) < version.parse("0.9.0"): diff --git a/src/transformers/models/data2vec/data2vec_text.py b/src/transformers/models/data2vec/data2vec_text.py new file mode 100644 index 0000000000000..d8cef5f342224 --- /dev/null +++ b/src/transformers/models/data2vec/data2vec_text.py @@ -0,0 +1,472 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +import math +from dataclasses import dataclass, field +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.configs import EMAConfig +from fairseq.models import FairseqEncoder, FairseqEncoderModel, register_model +from fairseq.models.ema import EMA +from fairseq.models.roberta.model import RobertaClassificationHead, RobertaLMHead +from fairseq.models.transformer import TransformerConfig, TransformerEncoder +from fairseq.modules.transformer_sentence_encoder import init_bert_params + +from omegaconf import II + + +logger = logging.getLogger(__name__) + + +@dataclass +class Data2VecTextConfig(FairseqDataclass): + max_positions: int = II("task.tokens_per_sample") + + head_layers: int = 1 + + transformer: TransformerConfig = TransformerConfig() + + load_checkpoint_heads: bool = field( + default=False, + metadata={"help": "(re-)register and load heads when loading checkpoints"}, + ) + + loss_beta: float = field(default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}) + loss_scale: Optional[float] = field( + default=None, + metadata={"help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"}, + ) + average_top_k_layers: int = field(default=8, metadata={"help": "how many layers to average"}) + + layer_norm_target_layer: bool = False + instance_norm_target_layer: bool = False + batch_norm_target_layer: bool = False + instance_norm_targets: bool = False + layer_norm_targets: bool = False + + ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) + ema_end_decay: float = field(default=0.9999, metadata={"help": "final ema decay rate"}) + + # when to finish annealing ema decay rate + ema_anneal_end_step: int = II("optimization.max_update") + + ema_transformer_layers_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer layers"}, + ) + + +def get_annealed_rate(start, end, curr_step, total_steps): + r = end - start + pct_remaining = 1 - curr_step / total_steps + return end - r * pct_remaining + + +@register_model("data2vec_text", dataclass=Data2VecTextConfig) +class Data2VecTextModel(FairseqEncoderModel): + def __init__(self, cfg: Data2VecTextConfig, encoder): + super().__init__(encoder) + self.cfg = cfg + + # We follow BERT's random weight initialization + self.apply(init_bert_params) + + self.classification_heads = nn.ModuleDict() + + @classmethod + def build_model(cls, cfg, task): + """Build a new model instance.""" + + encoder = Data2VecTextEncoder(cfg, task.source_dictionary, task.cfg.data) + + return cls(cfg, encoder) + + def forward( + self, + src_tokens, + target_tokens=None, + features_only=False, + return_all_hiddens=False, + classification_head_name=None, + **kwargs, + ): + if classification_head_name is not None: + features_only = True + + res = self.encoder(src_tokens, target_tokens, features_only, return_all_hiddens, **kwargs) + + if isinstance(res, tuple): + x, extra = res + else: + return res + + if classification_head_name is not None: + x = self.classification_heads[classification_head_name](x) + return x, extra + + def get_normalized_probs(self, net_output, log_probs, sample=None): + """Get normalized probabilities (or log probs) from a net's output.""" + logits = net_output[0].float() + if log_probs: + return F.log_softmax(logits, dim=-1) + else: + return F.softmax(logits, dim=-1) + + def 
register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs): + """Register a classification head.""" + if name in self.classification_heads: + prev_num_classes = self.classification_heads[name].out_proj.out_features + prev_inner_dim = self.classification_heads[name].dense.out_features + if num_classes != prev_num_classes or inner_dim != prev_inner_dim: + logger.warning( + 're-registering head "{}" with num_classes {} (prev: {}) ' + "and inner_dim {} (prev: {})".format( + name, num_classes, prev_num_classes, inner_dim, prev_inner_dim + ) + ) + self.classification_heads[name] = RobertaClassificationHead( + input_dim=self.cfg.transformer.encoder.embed_dim, + inner_dim=inner_dim or self.cfg.transformer.encoder.embed_dim, + num_classes=num_classes, + activation_fn="tanh", + pooler_dropout=0, + ) + + @property + def supported_targets(self): + return {"self"} + + def upgrade_state_dict_named(self, state_dict, name): + prefix = name + "." if name != "" else "" + + # rename decoder -> encoder before upgrading children modules + for k in list(state_dict.keys()): + if k.startswith(prefix + "decoder"): + new_k = prefix + "encoder" + k[len(prefix + "decoder") :] + state_dict[new_k] = state_dict[k] + del state_dict[k] + + # rename emb_layer_norm -> layernorm_embedding + for k in list(state_dict.keys()): + if ".emb_layer_norm." in k: + new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.") + state_dict[new_k] = state_dict[k] + del state_dict[k] + + if self.encoder.regression_head is not None: + if ".lm_head." in k: + new_k = k.replace(".lm_head.", ".regression_head.") + state_dict[new_k] = state_dict[k] + del state_dict[k] + else: + if ".regression_head." in k: + del state_dict[k] + + # upgrade children modules + super().upgrade_state_dict_named(state_dict, name) + + # Handle new classification heads present in the state dict. + current_head_names = ( + [] + if not hasattr(self, "classification_heads") or self.classification_heads is None + else self.classification_heads.keys() + ) + keys_to_delete = [] + for k in state_dict.keys(): + if not k.startswith(prefix + "classification_heads."): + continue + + head_name = k[len(prefix + "classification_heads.") :].split(".")[0] + num_classes = state_dict[prefix + "classification_heads." + head_name + ".out_proj.weight"].size(0) + inner_dim = state_dict[prefix + "classification_heads." + head_name + ".dense.weight"].size(0) + + if self.cfg.load_checkpoint_heads: + if head_name not in current_head_names: + self.register_classification_head(head_name, num_classes, inner_dim) + else: + if head_name not in current_head_names: + logger.warning( + "deleting classification head ({}) from checkpoint " + "not present in current model: {}".format(head_name, k) + ) + keys_to_delete.append(k) + elif ( + num_classes != self.classification_heads[head_name].out_proj.out_features + or inner_dim != self.classification_heads[head_name].dense.out_features + ): + logger.warning( + "deleting classification head ({}) from checkpoint " + "with different dimensions than current model: {}".format(head_name, k) + ) + keys_to_delete.append(k) + for k in keys_to_delete: + del state_dict[k] + + # Copy any newly-added classification heads into the state dict + # with their current weights. + if ( + hasattr(self, "classification_heads") + and self.classification_heads is not None + and len(self.classification_heads) > 0 + ): + cur_state = self.classification_heads.state_dict() + for k, v in cur_state.items(): + if prefix + "classification_heads." 
+ k not in state_dict: + logger.info("Overwriting " + prefix + "classification_heads." + k) + state_dict[prefix + "classification_heads." + k] = v + + for k in list(state_dict.keys()): + if k.startswith(prefix + "encoder.lm_head.") or k.startswith(prefix + "encoder.emb_head."): + del state_dict[k] + + self.encoder.lm_head = None + + if self.encoder.target_model is None: + for k in list(state_dict.keys()): + if k.startswith(prefix + "encoder.target_model."): + del state_dict[k] + + if (self.encoder.ema is None) and (prefix + "encoder._ema" in state_dict): + del state_dict[prefix + "encoder._ema"] + + def remove_pretraining_modules(self, last_layer=None): + self.encoder.lm_head = None + self.encoder.regression_head = None + self.encoder.ema = None + self.classification_heads = None + + if last_layer is not None: + self.encoder.sentence_encoder.layers = nn.ModuleList( + l for i, l in enumerate(self.encoder.sentence_encoder.layers) if i <= last_layer + ) + self.encoder.sentence_encoder.layer_norm = None + + +class Data2VecTextEncoder(FairseqEncoder): + def __init__(self, cfg: Data2VecTextConfig, dictionary, task_data): + super().__init__(dictionary) + + self.cfg = cfg + + embed_tokens = self.build_embedding(len(dictionary), cfg.transformer.encoder.embed_dim, dictionary.pad()) + + self.sentence_encoder = self.build_encoder(cfg, dictionary, embed_tokens) + self.mask_idx = dictionary.index("") + assert self.mask_idx != dictionary.unk(), dictionary.symbols + + self.ema = None + self.average_top_k_layers = cfg.average_top_k_layers + self.loss_scale = cfg.loss_scale + + assert self.cfg.head_layers >= 1 + + embed_dim = cfg.transformer.encoder.embed_dim + curr_dim = embed_dim + projs = [] + for i in range(self.cfg.head_layers - 1): + next_dim = embed_dim * 2 if i == 0 else curr_dim + projs.append(nn.Linear(curr_dim, next_dim)) + projs.append(nn.GELU()) + curr_dim = next_dim + + projs.append(nn.Linear(curr_dim, embed_dim)) + self.regression_head = nn.Sequential(*projs) + + self.num_updates = 0 + + def build_embedding(self, vocab_size, embedding_dim, padding_idx): + return nn.Embedding(vocab_size, embedding_dim, padding_idx) + + def build_encoder(self, cfg, dictionary, embed_tokens): + encoder = TransformerEncoder(cfg.transformer, dictionary, embed_tokens, return_fc=True) + encoder.apply(init_bert_params) + return encoder + + def build_lm_head(self, embed_dim, output_dim, activation_fn, weight): + return RobertaLMHead(embed_dim, output_dim, activation_fn, weight) + + def make_ema_teacher(self): + ema_config = EMAConfig( + ema_decay=self.cfg.ema_decay, + ema_fp32=True, + ) + skip_keys = set() + if self.cfg.ema_transformer_layers_only: + for k, _ in self.sentence_encoder.embed_positions.named_parameters(): + skip_keys.add(f"embed_tokens.{k}") + for k, _ in self.sentence_encoder.embed_positions.named_parameters(): + skip_keys.add(f"embed_positions.{k}") + if self.sentence_encoder.layernorm_embedding is not None: + for ( + k, + _, + ) in self.sentence_encoder.layernorm_embedding.named_parameters(): + skip_keys.add(f"layernorm_embedding.{k}") + if self.sentence_encoder.layer_norm is not None: + for k, _ in self.sentence_encoder.layer_norm.named_parameters(): + skip_keys.add(f"layernorm_embedding.{k}") + + self.ema = EMA( + self.sentence_encoder, + ema_config, + skip_keys=skip_keys, + ) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + + if self.ema is None and self.regression_head is not None: + logger.info("making ema teacher") + self.make_ema_teacher() + elif 
self.training and self.ema is not None: + if self.cfg.ema_decay != self.cfg.ema_end_decay: + if num_updates >= self.cfg.ema_anneal_end_step: + decay = self.cfg.ema_end_decay + else: + decay = get_annealed_rate( + self.cfg.ema_decay, + self.cfg.ema_end_decay, + num_updates, + self.cfg.ema_anneal_end_step, + ) + self.ema._set_decay(decay) + if self.ema.get_decay() < 1: + self.ema.step(self.sentence_encoder) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + state = super().state_dict(destination, prefix, keep_vars) + if self.ema is not None: + state[prefix + "_ema"] = self.ema.fp32_params + return state + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + if self.ema is not None: + k = prefix + "_ema" + assert k in state_dict + self.ema.restore(state_dict[k], True) + del state_dict[k] + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + def forward( + self, + src_tokens, + target_tokens=None, + features_only=False, + return_all_hiddens=False, + masked_tokens=None, + **unused, + ): + """ + Args: + src_tokens (LongTensor): input tokens of shape `(batch, src_len)` + features_only (bool, optional): skip LM head and just return + features. If True, the output will be of shape `(batch, src_len, embed_dim)`. + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + + Returns: + tuple: + - the LM output of shape `(batch, src_len, vocab)` + - a dictionary of additional data, where 'inner_states' is a list of hidden states. Note that the + hidden states have shape `(src_len, batch, vocab)`. + """ + + x, extra = self.extract_features(src_tokens, return_all_hiddens=return_all_hiddens) + + if features_only: + return x, extra + + assert target_tokens is not None + + with torch.no_grad(): + # use EMA parameter as the teacher + self.ema.model.eval() + + encoder_out = self.ema.model( + target_tokens, + return_all_hiddens=True, + ) + y = encoder_out["fc_results"] + + y = y[-self.average_top_k_layers :] + + permuted = False + if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: + y = [tl.permute(1, 2, 0) for tl in y] # TBC -> BCT + permuted = True + + if self.cfg.batch_norm_target_layer: + y = [F.batch_norm(tl.float(), running_mean=None, running_var=None, training=True) for tl in y] + + if self.cfg.instance_norm_target_layer: + y = [F.instance_norm(tl.float()) for tl in y] + + if permuted: + y = [tl.transpose(1, 2) for tl in y] # BCT -> BTC + + if self.cfg.layer_norm_target_layer: + y = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in y] + + y = sum(y) / len(y) + + if not permuted: + y = y.transpose(0, 1) + + if self.cfg.layer_norm_targets: + y = F.layer_norm(y.float(), y.shape[-1:]) + + if self.cfg.instance_norm_targets: + y = F.instance_norm(y.transpose(1, 2)).transpose(1, 2) + + masked_indices = src_tokens.eq(self.mask_idx) + + x = x[masked_indices] + y = y[masked_indices] + + x = self.regression_head(x) + + sz = x.size(-1) + if self.cfg.loss_beta == 0: + loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) + else: + loss = F.smooth_l1_loss(x.float(), y.float(), reduction="none", beta=self.cfg.loss_beta).sum(dim=-1) + + result = { + "losses": { + "main": loss.sum() / math.sqrt(sz) if self.loss_scale <= 0 else loss.sum() * self.loss_scale, + }, + "sample_size": loss.numel(), + } + + # logging other values + other_logs = {"ema_decay": self.ema.get_decay() * 1000} + result["logs"] = other_logs + return result + + def extract_features(self, 
src_tokens, return_all_hiddens=False, **kwargs): + encoder_out = self.sentence_encoder( + src_tokens, + return_all_hiddens=return_all_hiddens, + token_embeddings=kwargs.get("token_embeddings", None), + ) + # T x B x C -> B x T x C + features = encoder_out["encoder_out"][0].transpose(0, 1) + inner_states = encoder_out["encoder_states"] if return_all_hiddens else None + return features, { + "inner_states": inner_states, + "encoder_embedding": encoder_out["encoder_embedding"][0], + } + + def output_layer(self, features, masked_tokens=None, **unused): + return self.lm_head(features, masked_tokens) + + def max_positions(self): + """Maximum output length supported by the encoder.""" + return self.cfg.max_positions From 7c3ec90443797d4ecad623aa1f0b2e3e9c700e49 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 16:03:19 -0500 Subject: [PATCH 012/121] Update model name in docs/source/index.mdx to be CamelCase --- docs/source/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 8219f3b89e006..8773639dbbde0 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -177,7 +177,7 @@ Flax), PyTorch, and/or TensorFlow. | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| data2vec | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2Vec | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | From 2a652c4887a9e7d3b0d579fdad46938bad3c90d1 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 16:12:16 -0500 Subject: [PATCH 013/121] Revert model name in table to lower-case to get check_table test to pass --- docs/source/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 8773639dbbde0..8219f3b89e006 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -177,7 +177,7 @@ Flax), PyTorch, and/or TensorFlow. | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| Data2Vec | ❌ | ❌ | ✅ | ❌ | ❌ | +| data2vec | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | From 65219fb71e9bef360e3dcfb4aff4cd612de00c7f Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:09:53 -0800 Subject: [PATCH 014/121] Update src/transformers/models/data2vec/__init__.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 63b7945d90f38..312a1bf2db7f0 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -2,7 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
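The `set_num_updates` hook in the `Data2VecTextEncoder` reference code above anneals the teacher's EMA decay from `ema_decay` towards `ema_end_decay` over the first `ema_anneal_end_step` updates before stepping the teacher weights. The sketch below approximates that behaviour in plain PyTorch: `get_annealed_rate` is reconstructed from how it is called above, and `ema_step` is a simplified stand-in for fairseq's `EMA` helper (which additionally keeps fp32 shadow parameters and honours `skip_keys`), so treat both names as illustrative rather than the fairseq API.

```python
import copy

import torch


def get_annealed_rate(start: float, end: float, curr_step: int, total_steps: int) -> float:
    """Linearly anneal the EMA decay from `start` to `end` over `total_steps` updates."""
    if curr_step >= total_steps:
        return end
    remaining = 1 - curr_step / total_steps
    return end - (end - start) * remaining


@torch.no_grad()
def ema_step(teacher: torch.nn.Module, student: torch.nn.Module, decay: float) -> None:
    """teacher <- decay * teacher + (1 - decay) * student, parameter by parameter."""
    for t_param, s_param in zip(teacher.parameters(), student.parameters()):
        t_param.mul_(decay).add_(s_param, alpha=1 - decay)


# Toy usage: a single linear layer stands in for the sentence encoder.
student = torch.nn.Linear(8, 8)
teacher = copy.deepcopy(student)

ema_decay, ema_end_decay, ema_anneal_end_step = 0.999, 0.9999, 100_000
for num_updates in (0, 50_000, 100_000):
    decay = get_annealed_rate(ema_decay, ema_end_decay, num_updates, ema_anneal_end_step)
    ema_step(teacher, student, decay)
```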
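The `forward` method above condenses the whole data2vec objective into a few tensor operations: the EMA teacher encodes the unmasked input, its top-K layer outputs are normalized and averaged into regression targets, and the student predicts those targets at the masked positions under a Smooth L1 (or MSE) loss. The self-contained sketch below mirrors the `instance_norm_target_layer` branch only; the function name and the `average_top_k_layers`/`beta` values are illustrative defaults, not the exact fairseq configuration.

```python
import torch
import torch.nn.functional as F


def build_targets(layer_outputs, masked_indices, average_top_k_layers=8):
    """Average the teacher's top-K layer outputs into per-token regression targets.

    `layer_outputs` is a list of (seq_len, batch, dim) tensors, the layout produced by
    the fairseq encoder when called with `return_all_hiddens=True`.
    """
    y = layer_outputs[-average_top_k_layers:]
    y = [F.instance_norm(tl.permute(1, 2, 0).float()) for tl in y]  # (T, B, C) -> (B, C, T), norm over T
    y = [tl.transpose(1, 2) for tl in y]  # back to (B, T, C)
    y = sum(y) / len(y)  # average the top-K layers
    return y[masked_indices]  # keep only the masked time steps


# Toy example with random "teacher" activations.
seq_len, batch, dim, num_layers = 16, 2, 32, 12
layer_outputs = [torch.randn(seq_len, batch, dim) for _ in range(num_layers)]
masked_indices = torch.zeros(batch, seq_len, dtype=torch.bool)
masked_indices[:, :4] = True  # pretend the first four tokens were masked

targets = build_targets(layer_outputs, masked_indices)
student_prediction = torch.randn_like(targets)  # stand-in for regression_head(x)[masked_indices]
loss = F.smooth_l1_loss(student_prediction, targets, reduction="none", beta=4.0).sum(dim=-1)
print(targets.shape, loss.shape)  # (num_masked, dim), (num_masked,)
```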
From 432fef53651d6f3c128789e29879d38ce88b3408 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:10:07 -0800 Subject: [PATCH 015/121] Update src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py Co-authored-by: Patrick von Platen --- .../convert_data2vec_original_pytorch_checkpoint_to_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index 973af5144bcb6..58d09a9d2b09e 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. +# Copyright 2022 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 6aa237dc9a765f6fa2b7bd5a291f354aa1de0615 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:10:17 -0800 Subject: [PATCH 016/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 172958898535b..336e47c0dbbaa 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright 2022 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); From c82c7b2d336e71317e04296407a70097bbc72552 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:10:29 -0800 Subject: [PATCH 017/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/modeling_data2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 336e47c0dbbaa..405ba59e2d5b2 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -1,6 +1,5 @@ # coding=utf-8 # Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 96c0899fa37f48a1d67e11540f7dc0f9cd2d3180 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:11:04 -0800 Subject: [PATCH 018/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/modeling_data2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 405ba59e2d5b2..0ca4f514da055 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -67,7 +67,6 @@ class Data2VecEmbeddings(nn.Module): Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ - # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) From dd40020e9a6853d8f33c41860b8174bc9ab209ab Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:11:09 -0800 Subject: [PATCH 019/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/modeling_data2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 0ca4f514da055..a627ed82dacc7 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -87,7 +87,6 @@ def __init__(self, config): persistent=False, ) - # End copy self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx From 57059513920d4ce52057bde5f693549815537c13 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:12:38 -0800 Subject: [PATCH 020/121] Update docs/source/model_doc/data2vec.mdx Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/model_doc/data2vec.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index a9fc0caff33f0..c994c3b9daa83 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -1,4 +1,4 @@ - -# data2vec +# Data2Vec ## Overview From d0936e874346f7a36e579d8a76d9237cc2764912 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:13:14 -0800 Subject: [PATCH 022/121] Update src/transformers/models/auto/configuration_auto.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/auto/configuration_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 4acb20c3326f3..c052d4c08fc40 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -263,7 +263,7 @@ ("reformer", "Reformer"), ("longformer", "Longformer"), ("roberta", "RoBERTa"), - ("data2vec", "data2vec"), + ("data2vec", "Data2Vec"), ("flaubert", "FlauBERT"), ("fsmt", "FairSeq Machine-Translation"), ("squeezebert", "SqueezeBERT"), From 3a058d706a46d4e4e9915d7b0bb6dc20af8cfb30 Mon Sep 17 00:00:00 2001 From: Eduardo 
Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:13:22 -0800 Subject: [PATCH 023/121] Update src/transformers/models/data2vec/configuration_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/configuration_data2vec.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index f4929e2854d16..b24a27a73538e 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -1,6 +1,5 @@ # coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 6d73033ebbe71ad2a7014dd6b6fb572636166572 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:13:30 -0800 Subject: [PATCH 024/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index a627ed82dacc7..33e5b067c6786 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch data2vec model.""" +"""PyTorch Data2Vec model.""" import math From a555792cf20b6f8adf0a8378663c8e6ff68b972b Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:13:42 -0800 Subject: [PATCH 025/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 33e5b067c6786..39c85fa11a107 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -683,7 +683,7 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): @add_start_docstrings( - "The bare data2vec Model transformer outputting raw hidden-states without any specific head on top.", + "The bare Data2Vec Model transformer outputting raw hidden-states without any specific head on top.", DATA2VEC_START_DOCSTRING, ) # Copied from transformers.models.roberta.modeling_roberta.RobertaModel with ROBERTA->DATA2VEC,Roberta->Data2Vec From 5b7dff1522c83000372cde05242470941690ee39 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:14:05 -0800 Subject: [PATCH 026/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 39c85fa11a107..2b5bd2c043434 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -1152,7 +1152,7 @@ def _tie_weights(self): @add_start_docstrings( """ - data2vec Model transformer with a sequence classification/regression head on top (a linear layer on top of the + Data2Vec Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, DATA2VEC_START_DOCSTRING, From 492f510dbfee7b78499328e67835e19a754f8ee2 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:14:11 -0800 Subject: [PATCH 027/121] Update tests/test_modeling_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- tests/test_modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 69a759b4c6b11..4a59ba45c3a27 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 5e96b7042e20c383b555dce55959b9c10a4ef7cf Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:15:24 -0800 Subject: [PATCH 028/121] Update src/transformers/models/data2vec/configuration_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/configuration_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index b24a27a73538e..706ebb8f61099 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" data2vec configuration""" +""" Data2Vec configuration""" from collections import OrderedDict from typing import Mapping From 34d045c9551816ae0963ae8c0b7a63c20904e043 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:15:34 -0800 Subject: [PATCH 029/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 2b5bd2c043434..5956cac074afa 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -871,7 +871,7 @@ def forward( @add_start_docstrings( - """data2vec Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VEC_START_DOCSTRING + """Data2Vec Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VEC_START_DOCSTRING ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec,roberta-base->data2vec class Data2VecForCausalLM(Data2VecPreTrainedModel): From d239d63f1eec582ab15579d95a271eed37020559 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 20:21:16 -0500 Subject: [PATCH 030/121] Update documentation --- docs/source/model_doc/data2vec.mdx | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index a9fc0caff33f0..21ced12bd836f 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -1,4 +1,4 @@ - -# data2vec +# Data2Vec ## Overview -The data2vec model was proposed in []() by . - +The Data2Vec model was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://scontent-sjc3-1.xx.fbcdn.net/v/t39.8562-6/271974914_483120576492438_4239522333319653600_n.pdf?_nc_cat=107&ccb=1-5&_nc_sid=ae5e01&_nc_ohc=7huShTb_QZIAX-N7SYx&_nc_ht=scontent-sjc3-1.xx&oh=00_AT_lXXL69mjqmdVWbaLh4Ro6DY17aFeO5vA9I-mIpyNieg&oe=6205C411) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and Michael Auli. +Data2Vec proposes a unified framework for self-supervised learning across different data modalities - text, audio and images. 
+Importantly, predicted targets for pre-training are contextualized latent representations of the inputs, rather than modality-specific, context-independent targets. The abstract from the paper is the following: -** +*While the general idea of self-supervised learning is identical across modalities, the actual algorithms and +objectives differ widely because they were developed with a single modality in mind. To get us closer to general +self-supervised learning, we present data2vec, a framework that uses the same learning method for either speech, +NLP or computer vision. The core idea is to predict latent representations of the full input data based on a +masked view of the input in a selfdistillation setup using a standard Transformer architecture. +Instead of predicting modality-specific targets such as words, visual tokens or units of human speech which +are local in nature, data2vec predicts contextualized latent representations that contain information from +the entire input. Experiments on the major benchmarks of speech recognition, image classification, and +natural language understanding demonstrate a new state of the art or competitive performance to predominant approaches. +Models and code are available at www.github.com/pytorch/fairseq/tree/master/examples/data2vec.* Tips: - +- This implementation has a shared encoder for all different modalities and different pre-processors for each modality. + For example, in the case of text, preprocessing is identical to [`RobertaModel`], including tokenization. -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). +This model was contributed by [edugp](https://huggingface.co/edugp). +The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec). ## Data2VecConfig From 3e6cd537a82c50a1331d3fc95ffdc515bfd382ac Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 8 Feb 2022 20:56:16 -0500 Subject: [PATCH 031/121] Copy-paste Data2VecConfig from BertConfig --- .../models/data2vec/configuration_data2vec.py | 99 ++++++++++++++++--- 1 file changed, 86 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index 706ebb8f61099..c058a37a634bd 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -16,9 +16,9 @@ from collections import OrderedDict from typing import Mapping +from ...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig from ...utils import logging -from ..bert.configuration_bert import BertConfig logger = logging.get_logger(__name__) @@ -28,27 +28,66 @@ } -class Data2VecConfig(BertConfig): +class Data2VecConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecModel`] or a [`TFData2VecModel`]. It is - used to instantiate a data2vec model according to the specified arguments, defining the model architecture. - + This is the configuration class to store the configuration of a [`Data2VecModel`], identical to [`BertConfig]. + It is used to instantiate a Data2Vec model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2Vec + [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. 
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - The [`Data2VecConfig`] class directly inherits [`BertConfig`]. It reuses the same defaults. Please check the parent - class for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
Examples: ```python - >>> from transformers import Data2VecConfig, Data2VecModel + >>> from transformers import Data2VecModel, Data2VecConfig - >>> # Initializing a data2vec configuration + >>> # Initializing a Data2Vec edugp/data2vec-nlp-base style configuration >>> configuration = Data2VecConfig() - >>> # Initializing a model from the configuration + >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration >>> model = Data2VecModel(configuration) >>> # Accessing the model configuration @@ -56,9 +95,43 @@ class for more information. ```""" model_type = "data2vec" - def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): - """Constructs Data2VecConfig.""" - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout class Data2VecOnnxConfig(OnnxConfig): From 91c03bdda30fc2f968e7a634e74f15154ba203c8 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 8 Feb 2022 21:17:35 -0500 Subject: [PATCH 032/121] Update config checkpoint to point to edugp/data2vec-nlp-base. Fix style and repo-consistency --- docs/source/index.mdx | 2 +- docs/source/serialization.mdx | 2 +- src/transformers/models/data2vec/configuration_data2vec.py | 6 +++--- src/transformers/models/data2vec/modeling_data2vec.py | 2 ++ 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 8219f3b89e006..8773639dbbde0 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -177,7 +177,7 @@ Flax), PyTorch, and/or TensorFlow. 
| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| data2vec | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2Vec | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/serialization.mdx b/docs/source/serialization.mdx index c4dc49e8c9610..e235536bc86ac 100644 --- a/docs/source/serialization.mdx +++ b/docs/source/serialization.mdx @@ -49,7 +49,7 @@ Ready-made configurations include the following architectures: - BART - BERT - CamemBERT -- data2vec +- Data2Vec - DistilBERT - GPT Neo - I-BERT diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index c058a37a634bd..8dcd44372c313 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -24,14 +24,14 @@ logger = logging.get_logger(__name__) DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "data2vec": "https://huggingface.co/data2vec/resolve/main/config.json", + "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json", } class Data2VecConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecModel`], identical to [`BertConfig]. - It is used to instantiate a Data2Vec model according to the specified arguments, defining the model architecture. + This is the configuration class to store the configuration of a [`Data2VecModel`], identical to [`BertConfig]. It + is used to instantiate a Data2Vec model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2Vec [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 5956cac074afa..37c7354d9432a 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -67,6 +67,7 @@ class Data2VecEmbeddings(nn.Module): Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
""" + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) @@ -87,6 +88,7 @@ def __init__(self, config): persistent=False, ) + # End copy self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx From 7fa2e2a07c5d13fe24f20c2133da6adf76b77719 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 8 Feb 2022 21:40:24 -0500 Subject: [PATCH 033/121] Update config special tokens to match RoBERTa --- src/transformers/models/data2vec/configuration_data2vec.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index 8dcd44372c313..ae56227dd921b 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -94,7 +94,6 @@ class Data2VecConfig(PretrainedConfig): >>> configuration = model.config ```""" model_type = "data2vec" - def __init__( self, vocab_size=30522, @@ -109,13 +108,15 @@ def __init__( type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, - pad_token_id=0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs ): - super().__init__(pad_token_id=pad_token_id, **kwargs) + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size From 988bbf0bae4c3af625c0f91dff3b71b9b957072c Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 8 Feb 2022 21:41:25 -0500 Subject: [PATCH 034/121] Split multiple assertions and add individual error messages --- ..._original_pytorch_checkpoint_to_pytorch.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index 58d09a9d2b09e..4173bc3085c5e 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -95,12 +95,15 @@ def convert_data2vec_checkpoint_to_pytorch( # self attention self_attn: BertSelfAttention = layer.attention.self - assert ( - data2vec_layer.self_attn.k_proj.weight.data.shape - == data2vec_layer.self_attn.q_proj.weight.data.shape - == data2vec_layer.self_attn.v_proj.weight.data.shape - == torch.Size((config.hidden_size, config.hidden_size)) - ) + assert data2vec_layer.self_attn.k_proj.weight.data.shape == torch.Size( + (config.hidden_size, config.hidden_size) + ), f"Shape for data2vec_layer.self_attn.k_proj.weight.data should be {torch.Size((config.hidden_size, config.hidden_size))}" + assert data2vec_layer.self_attn.q_proj.weight.data.shape == torch.Size( + (config.hidden_size, config.hidden_size) + ), f"Shape for data2vec_layer.self_attn.q_proj.weight.data should be {torch.Size((config.hidden_size, config.hidden_size))}" + assert data2vec_layer.self_attn.v_proj.weight.data.shape == torch.Size( + (config.hidden_size, config.hidden_size) + ), f"Shape for 
data2vec_layer.self_attn.v_proj.weight.data should be {torch.Size((config.hidden_size, config.hidden_size))}" self_attn.query.weight.data = data2vec_layer.self_attn.q_proj.weight self_attn.query.bias.data = data2vec_layer.self_attn.q_proj.bias @@ -111,7 +114,9 @@ def convert_data2vec_checkpoint_to_pytorch( # self-attention output self_output: BertSelfOutput = layer.attention.output - assert self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape + assert ( + self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape + ), f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}" self_output.dense.weight = data2vec_layer.self_attn.out_proj.weight self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight @@ -119,13 +124,17 @@ def convert_data2vec_checkpoint_to_pytorch( # intermediate intermediate: BertIntermediate = layer.intermediate - assert intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape + assert ( + intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape + ), f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}" intermediate.dense.weight = data2vec_layer.fc1.weight intermediate.dense.bias = data2vec_layer.fc1.bias # output bert_output: BertOutput = layer.output - assert bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape + assert ( + bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape + ), f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}" bert_output.dense.weight = data2vec_layer.fc2.weight bert_output.dense.bias = data2vec_layer.fc2.bias bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight From b796850406bba62a94d6b2940061fdc490e8e000 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 9 Feb 2022 00:38:25 -0500 Subject: [PATCH 035/121] Rename Data2VecModel to Data2VecForTextModel --- docs/source/model_doc/data2vec.mdx | 4 +-- src/transformers/__init__.py | 4 +-- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/data2vec/__init__.py | 4 +-- .../models/data2vec/configuration_data2vec.py | 13 +++---- .../models/data2vec/modeling_data2vec.py | 23 +++++------- src/transformers/utils/dummy_pt_objects.py | 4 +-- tests/test_modeling_data2vec.py | 35 ++++--------------- 8 files changed, 30 insertions(+), 59 deletions(-) diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index 21ced12bd836f..619c186edfbd9 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -44,9 +44,9 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma [[autodoc]] Data2VecConfig -## Data2VecModel +## Data2VecForTextModel -[[autodoc]] Data2VecModel +[[autodoc]] Data2VecForTextModel - forward ## Data2VecForCausalLM diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ebc184f567b58..44f7a1b4db65c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -855,8 +855,8 @@ "Data2VecForMultipleChoice", "Data2VecForQuestionAnswering", "Data2VecForSequenceClassification", + "Data2VecForTextModel", "Data2VecForTokenClassification", - "Data2VecModel", "Data2VecPreTrainedModel", ] ) @@ -2950,8 +2950,8 @@ Data2VecForMultipleChoice, Data2VecForQuestionAnswering, Data2VecForSequenceClassification, + Data2VecForTextModel, Data2VecForTokenClassification, - 
Data2VecModel, Data2VecPreTrainedModel, ) from .models.deberta import ( diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index be5e3987a7b89..82b7ab2ac0a83 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -80,7 +80,7 @@ ("bart", "BartModel"), ("longformer", "LongformerModel"), ("roberta", "RobertaModel"), - ("data2vec", "Data2VecModel"), + ("data2vec", "Data2VecForTextModel"), ("layoutlm", "LayoutLMModel"), ("squeezebert", "SqueezeBertModel"), ("bert", "BertModel"), diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 312a1bf2db7f0..62a37ff78ba9f 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -33,8 +33,8 @@ "Data2VecForMultipleChoice", "Data2VecForQuestionAnswering", "Data2VecForSequenceClassification", + "Data2VecForTextModel", "Data2VecForTokenClassification", - "Data2VecModel", "Data2VecPreTrainedModel", ] @@ -49,8 +49,8 @@ Data2VecForMultipleChoice, Data2VecForQuestionAnswering, Data2VecForSequenceClassification, + Data2VecForTextModel, Data2VecForTokenClassification, - Data2VecModel, Data2VecPreTrainedModel, ) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index ae56227dd921b..32cb944aa243d 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -30,10 +30,10 @@ class Data2VecConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecModel`], identical to [`BertConfig]. It - is used to instantiate a Data2Vec model according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2Vec - [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. + This is the configuration class to store the configuration of a [`Data2VecForTextModel`], identical to + [`BertConfig]. It is used to instantiate a Data2Vec model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + Data2Vec [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
@@ -82,18 +82,19 @@ class Data2VecConfig(PretrainedConfig): Examples: ```python - >>> from transformers import Data2VecModel, Data2VecConfig + >>> from transformers import Data2VecForTextModel, Data2VecConfig >>> # Initializing a Data2Vec edugp/data2vec-nlp-base style configuration >>> configuration = Data2VecConfig() >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration - >>> model = Data2VecModel(configuration) + >>> model = Data2VecForTextModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" model_type = "data2vec" + def __init__( self, vocab_size=30522, diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 37c7354d9432a..d8425fea00236 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -152,7 +152,6 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): return position_ids.unsqueeze(0).expand(input_shape) -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Data2Vec class Data2VecSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() @@ -253,7 +252,7 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in Data2VecModel forward() function) + # Apply the attention mask is (precomputed for all layers in Data2VecForTextModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. @@ -689,7 +688,7 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): DATA2VEC_START_DOCSTRING, ) # Copied from transformers.models.roberta.modeling_roberta.RobertaModel with ROBERTA->DATA2VEC,Roberta->Data2Vec -class Data2VecModel(Data2VecPreTrainedModel): +class Data2VecForTextModel(Data2VecPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -875,7 +874,6 @@ def forward( @add_start_docstrings( """Data2Vec Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VEC_START_DOCSTRING ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec,roberta-base->data2vec class Data2VecForCausalLM(Data2VecPreTrainedModel): _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] @@ -887,7 +885,7 @@ def __init__(self, config): if not config.is_decoder: logger.warning("If you want to use `Data2VecLMHeadModel` as a standalone, add `is_decoder=True.`") - self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings @@ -1028,7 +1026,6 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VEC_START_DOCSTRING) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec class Data2VecForMaskedLM(Data2VecPreTrainedModel): _keys_to_ignore_on_save = 
[r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] @@ -1043,7 +1040,7 @@ def __init__(self, config): "bi-directional self-attention." ) - self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings @@ -1159,7 +1156,6 @@ def _tie_weights(self): """, DATA2VEC_START_DOCSTRING, ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec class Data2VecForSequenceClassification(Data2VecPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -1168,7 +1164,7 @@ def __init__(self, config): self.num_labels = config.num_labels self.config = config - self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) self.classifier = Data2VecClassificationHead(config) # Initialize weights and apply final processing @@ -1258,14 +1254,13 @@ def forward( """, DATA2VEC_START_DOCSTRING, ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec class Data2VecForMultipleChoice(Data2VecPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config): super().__init__(config) - self.data2vec = Data2VecModel(config) + self.data2vec = Data2VecForTextModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) @@ -1354,7 +1349,6 @@ def forward( """, DATA2VEC_START_DOCSTRING, ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec class Data2VecForTokenClassification(Data2VecPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -1363,7 +1357,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1463,7 +1457,6 @@ def forward(self, features, **kwargs): """, DATA2VEC_START_DOCSTRING, ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec class Data2VecForQuestionAnswering(Data2VecPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -1472,7 +1465,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) # Initialize weights and apply final processing diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 129f57a297d9f..197874c4c09e8 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1177,14 
+1177,14 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForTokenClassification(metaclass=DummyObject): +class Data2VecForTextModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecModel(metaclass=DummyObject): +class Data2VecForTokenClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 4a59ba45c3a27..3ff5f25bddcbc 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -15,7 +15,6 @@ import unittest -from copy import deepcopy from transformers import Data2VecConfig, is_torch_available from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device @@ -34,8 +33,8 @@ Data2VecForMultipleChoice, Data2VecForQuestionAnswering, Data2VecForSequenceClassification, + Data2VecForTextModel, Data2VecForTokenClassification, - Data2VecModel, ) from transformers.models.data2vec.modeling_data2vec import ( DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, @@ -43,8 +42,6 @@ create_position_ids_from_input_ids, ) -DATA2VEC_TINY = "sshleifer/tiny-distildata2vec" - class Data2VecModelTester: def __init__( @@ -142,7 +139,7 @@ def prepare_config_and_inputs_for_decoder(self): def create_and_check_model( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = Data2VecModel(config=config) + model = Data2VecForTextModel(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) @@ -165,7 +162,7 @@ def create_and_check_model_as_decoder( encoder_attention_mask, ): config.add_cross_attention = True - model = Data2VecModel(config) + model = Data2VecForTextModel(config) model.to(torch_device) model.eval() result = model( @@ -346,7 +343,7 @@ class Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCa ( Data2VecForCausalLM, Data2VecForMaskedLM, - Data2VecModel, + Data2VecForTextModel, Data2VecForSequenceClassification, Data2VecForTokenClassification, Data2VecForMultipleChoice, @@ -433,7 +430,7 @@ def test_for_question_answering(self): @slow def test_model_from_pretrained(self): for model_name in DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = Data2VecModel.from_pretrained(model_name) + model = Data2VecForTextModel.from_pretrained(model_name) self.assertIsNotNone(model) def test_create_position_ids_respects_padding_index(self): @@ -502,7 +499,7 @@ def test_inference_masked_lm(self): @slow def test_inference_no_head(self): - model = Data2VecModel.from_pretrained("data2vec") + model = Data2VecForTextModel.from_pretrained("data2vec") input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) with torch.no_grad(): @@ -534,23 +531,3 @@ def test_inference_classification_head(self): # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) - - # XXX: this might be a candidate for common tests if we have many of those - def test_lm_head_ignore_keys(self): - keys_to_ignore_on_save_tied = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - keys_to_ignore_on_save_untied = [r"lm_head.decoder.bias"] - config = Data2VecConfig.from_pretrained(DATA2VEC_TINY) - config_tied = deepcopy(config) - config_tied.tie_word_embeddings = True - config_untied = deepcopy(config) - 
config_untied.tie_word_embeddings = False - for cls in [Data2VecForMaskedLM, Data2VecForCausalLM]: - model = cls(config_tied) - self.assertEqual(model._keys_to_ignore_on_save, keys_to_ignore_on_save_tied, cls) - - # the keys should be different when embeddings aren't tied - model = cls(config_untied) - self.assertEqual(model._keys_to_ignore_on_save, keys_to_ignore_on_save_untied, cls) - - # test that saving works with updated ignore keys - just testing that it doesn't fail - model.save_pretrained(self.get_auto_remove_tmp_dir()) From 0ad60a62f5da1f92bc608a6906e7ac7ec484da0d Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 9 Feb 2022 00:49:05 -0500 Subject: [PATCH 036/121] Add Data2Vec to _toctree.yml --- docs/source/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index f1cb9a80f31a9..c211b2dd6f894 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -158,6 +158,8 @@ title: CPM - local: model_doc/ctrl title: CTRL + - local: model_doc/data2vec + title: Data2Vec - local: model_doc/deberta title: DeBERTa - local: model_doc/deberta-v2 From 913de4fa3fcea41f14da4eb575af233b109f855d Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 9 Feb 2022 16:36:04 -0500 Subject: [PATCH 037/121] Rename Data2VecEmbeddings to Data2VecForTextEmbeddings --- src/transformers/models/data2vec/modeling_data2vec.py | 6 ++---- tests/test_modeling_data2vec.py | 10 +++++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index d8425fea00236..97aa70b804fa3 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -62,7 +62,7 @@ # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2Vec -class Data2VecEmbeddings(nn.Module): +class Data2VecForTextEmbeddings(nn.Module): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
""" @@ -687,7 +687,6 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): "The bare Data2Vec Model transformer outputting raw hidden-states without any specific head on top.", DATA2VEC_START_DOCSTRING, ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaModel with ROBERTA->DATA2VEC,Roberta->Data2Vec class Data2VecForTextModel(Data2VecPreTrainedModel): """ @@ -706,12 +705,11 @@ class Data2VecForTextModel(Data2VecPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Data2Vec def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config - self.embeddings = Data2VecEmbeddings(config) + self.embeddings = Data2VecForTextEmbeddings(config) self.encoder = Data2VecEncoder(config) self.pooler = Data2VecPooler(config) if add_pooling_layer else None diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 3ff5f25bddcbc..3426ed9b86507 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -38,7 +38,7 @@ ) from transformers.models.data2vec.modeling_data2vec import ( DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecEmbeddings, + Data2VecForTextEmbeddings, create_position_ids_from_input_ids, ) @@ -438,10 +438,10 @@ def test_create_position_ids_respects_padding_index(self): test for https://github.com/huggingface/transformers/issues/1761 The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecEmbeddings.padding_idx + 1 + first available non-padding position index is Data2VecForTextEmbeddings.padding_idx + 1 """ config = self.model_tester.prepare_config_and_inputs()[0] - model = Data2VecEmbeddings(config=config) + model = Data2VecForTextEmbeddings(config=config) input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) expected_positions = torch.as_tensor( @@ -457,10 +457,10 @@ def test_create_position_ids_from_inputs_embeds(self): test for https://github.com/huggingface/transformers/issues/1761 The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecEmbeddings.padding_idx + 1 + first available non-padding position index is Data2VecForTextEmbeddings.padding_idx + 1 """ config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = Data2VecEmbeddings(config=config) + embeddings = Data2VecForTextEmbeddings(config=config) inputs_embeds = torch.empty(2, 4, 30) expected_single_positions = [ From 8e5902ff859e1214fc0e22c3bc94f0d2ccafd66f Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 15 Feb 2022 22:48:09 -0500 Subject: [PATCH 038/121] Add initial Data2VecForAudio model (unfinished). Only matching fairseq's implementation up to the feature encoder (before positional encoding). 
--- docs/source/model_doc/data2vec.mdx | 5 + src/transformers/__init__.py | 2 + src/transformers/models/data2vec/__init__.py | 2 + .../models/data2vec/configuration_data2vec.py | 45 +- ..._original_pytorch_checkpoint_to_pytorch.py | 246 +++++++++ ...original_pytorch_checkpoint_to_pytorch.py} | 0 .../models/data2vec/data2vec_audio.py | 481 ++++++++++++++++++ .../models/data2vec/data2vec_text.py | 2 + .../models/data2vec/ema_module.py | 133 +++++ .../models/data2vec/modeling_data2vec.py | 85 +++- src/transformers/utils/dummy_pt_objects.py | 7 + utils/check_repo.py | 2 + 12 files changed, 1005 insertions(+), 5 deletions(-) create mode 100644 src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py rename src/transformers/models/data2vec/{convert_data2vec_original_pytorch_checkpoint_to_pytorch.py => convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py} (100%) create mode 100644 src/transformers/models/data2vec/data2vec_audio.py create mode 100644 src/transformers/models/data2vec/ema_module.py diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index 619c186edfbd9..2471ccc883c32 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -44,6 +44,11 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma [[autodoc]] Data2VecConfig +## Data2VecForAudioModel + +[[autodoc]] Data2VecForAudioModel + - forward + ## Data2VecForTextModel [[autodoc]] Data2VecForTextModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 44f7a1b4db65c..66f215a23e6f9 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -850,6 +850,7 @@ _import_structure["models.data2vec"].extend( [ "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecForAudioModel", "Data2VecForCausalLM", "Data2VecForMaskedLM", "Data2VecForMultipleChoice", @@ -2945,6 +2946,7 @@ ) from .models.data2vec import ( DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecForAudioModel, Data2VecForCausalLM, Data2VecForMaskedLM, Data2VecForMultipleChoice, diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 62a37ff78ba9f..00873fff121d3 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -28,6 +28,7 @@ if is_torch_available(): _import_structure["modeling_data2vec"] = [ "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecForAudioModel", "Data2VecForCausalLM", "Data2VecForMaskedLM", "Data2VecForMultipleChoice", @@ -44,6 +45,7 @@ if is_torch_available(): from .modeling_data2vec import ( DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecForAudioModel, Data2VecForCausalLM, Data2VecForMaskedLM, Data2VecForMultipleChoice, diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index 32cb944aa243d..dd2260a13cacd 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -30,10 +30,10 @@ class Data2VecConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecForTextModel`], identical to - [`BertConfig]. It is used to instantiate a Data2Vec model according to the specified arguments, defining the model - architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the - Data2Vec [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. + This is the configuration class to store the configuration of a [`Data2VecForTextModel`] and + [`Data2VecForAudioModel`]. It is used to instantiate a Data2Vec model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the Data2Vec [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -115,6 +115,22 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, + # Here starts audio-only parameters + feat_extract_norm="layer", + feat_extract_activation="gelu", + num_conv_pos_embeddings=19, + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + feat_proj_dropout=0.0, + mask_time_prob=0.05, + do_stable_layer_norm=False, + num_conv_pos_embedding_groups=16, + activation_dropout=0.1, + add_adapter=False, + final_dropout=0.1, + proj_codevector_dim=256, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -134,6 +150,27 @@ def __init__( self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout + # Here starts audio-only parameters + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_feat_extract_layers = len(self.conv_dim) + self.feat_proj_dropout = feat_proj_dropout + self.mask_time_prob = mask_time_prob + self.do_stable_layer_norm = do_stable_layer_norm + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + # Using the same dropout as for text + self.hidden_dropout = self.hidden_dropout_prob + # Using the same dropout as for text + self.attention_dropout = self.attention_probs_dropout_prob + self.activation_dropout = activation_dropout + self.add_adapter = add_adapter + self.final_dropout = final_dropout + self.proj_codevector_dim = proj_codevector_dim class Data2VecOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000..abf8285674f3e --- /dev/null +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,246 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Wav2Vec2 checkpoint.""" + + +import argparse +import os + +import torch +from datasets import load_dataset + +import soundfile as sf +from transformers import Wav2Vec2Processor, logging + +from ..data2vec import Data2VecConfig, Data2VecForAudioModel + +# Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py +from .data2vec_audio import Data2VecAudioModel + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +MAPPING = { + "post_extract_proj": "feature_projection.projection", + "models.0.layer_norm": "feature_projection.layer_norm", + "encoder.pos_conv.0": "pos_conv_embed.conv", + "self_attn.k_proj": "encoder.layer.*.attention.self.key", + "self_attn.v_proj": "encoder.layer.*.attention.self.value", + "self_attn.q_proj": "encoder.layer.*.attention.self.query", + "self_attn.out_proj.weight": "encoder.layer.*.attention.output.dense", + "self_attn_layer_norm": "encoder.layer.*.attention.output.LayerNorm", + "fc1": "encoder.layer.*.intermediate.dense", + "fc2": "encoder.layer.*.output.dense", + "final_layer_norm": "encoder.layer.*.output.LayerNorm", + "encoder.layer_norm": "layer_norm", + "w2v_model.layer_norm": "feature_projection.layer_norm", + "quantizer.weight_proj": "quantizer.weight_proj", + "quantizer.vars": "quantizer.codevectors", + "project_q": "project_q", + # "final_proj": "project_hid", + "w2v_encoder.proj": "lm_head", + "mask_emb": "masked_spec_embed", +} +TOP_LEVEL_KEYS = [ + "lm_head", + "quantizer.weight_proj", + "quantizer.codevectors", + "project_q", + "project_hid", +] + + +def set_recursively(hf_pointer, key, value, full_name, weight_type): + for attribute in key.split("."): + hf_pointer = getattr(hf_pointer, attribute) + + if weight_type is not None: + hf_shape = getattr(hf_pointer, weight_type).shape + else: + hf_shape = hf_pointer.shape + + if hf_shape != value.shape: + raise ValueError( + f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be {value.shape} for {full_name}" + ) + + if weight_type == "weight": + hf_pointer.weight.data = value + elif weight_type == "weight_g": + hf_pointer.weight_g.data = value + elif weight_type == "weight_v": + hf_pointer.weight_v.data = value + elif weight_type == "bias": + hf_pointer.bias.data = value + else: + hf_pointer.data = value + + logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") + + +def recursively_load_weights(fairseq_model, hf_model, is_headless): + unused_weights = [] + fairseq_dict = fairseq_model.state_dict() + + feature_extractor = hf_model.feature_extractor + + for name, value in fairseq_dict.items(): + is_used = False + if "conv_layers" in name: + load_conv_layer( + name, + value, + feature_extractor, + unused_weights, + hf_model.config.feat_extract_norm == "group", + ) + is_used = True + else: + for key, mapped_key in MAPPING.items(): + # mapped_key = "wav2vec2." 
+ mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key + if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]: + is_used = True + if "*" in mapped_key: + layer_index = name.split(key)[0].split(".")[-2] + mapped_key = mapped_key.replace("*", layer_index) + if "weight_g" in name: + weight_type = "weight_g" + elif "weight_v" in name: + weight_type = "weight_v" + elif "bias" in name: + weight_type = "bias" + elif "weight" in name: + # TODO: don't match quantizer.weight_proj + weight_type = "weight" + else: + weight_type = None + set_recursively(hf_model, mapped_key, value, name, weight_type) + continue + if not is_used: + unused_weights.append(name) + + logger.warning(f"Unused weights: {unused_weights}") + + +def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): + name = full_name.split("conv_layers.")[-1] + items = name.split(".") + layer_id = int(items[0]) + type_id = int(items[1]) + + if type_id == 0: + if "bias" in name: + if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].conv.bias.data = value + logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") + elif "weight" in name: + if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].conv.weight.data = value + logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") + elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): + if "bias" in name: + if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value + logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") + elif "weight" in name: + if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value + logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") + else: + unused_weights.append(full_name) + + +@torch.no_grad() +def convert_wav2vec2_checkpoint( + checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True +): + """ + Copy/paste/tweak model's weights to transformers design. 
+ """ + if config_path is not None: + config = Data2VecConfig.from_pretrained(config_path) + else: + config = Data2VecConfig() + + # hf_wav2vec = Wav2Vec2ForPreTraining(config) + hf_wav2vec = Data2VecForAudioModel(config) + + # Modify final_proj layer name + data2vec_checkpoint_dir = os.path.dirname(checkpoint_path) + state_dict = torch.load(checkpoint_path) + state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight") + state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias") + converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt") + torch.save(state_dict, converted_ckpt) + model = Data2VecAudioModel.from_pretrained(data2vec_checkpoint_dir, checkpoint_file=converted_ckpt) + model = model.eval() + + recursively_load_weights(model, hf_wav2vec, not is_finetuned) + + def map_to_array(batch): + speech, _ = sf.read(batch["file"]) + batch["speech"] = speech + return batch + + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = ds.map(map_to_array) + input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 + hf_wav2vec.eval() + our_output = hf_wav2vec(input_values)["last_hidden_state"] + # our_hidden_states = hf_wav2vec(input_values, output_hidden_states=-True).hidden_states + model.eval() + their_output = model.models[0](input_values, mask=False, features_only=True)["layer_results"][-1][0].transpose( + 0, 1 + ) + # their_hidden_states = model.models[0](input_values, mask=False, features_only=True)["layer_results"] + + print(our_output.shape, their_output.shape) + max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() + print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 + success = torch.allclose(our_output, their_output, atol=1e-3) + print("Do both models output the same tensors?", "🔥" if success else "💩") + if not success: + raise Exception("Something went wRoNg") + + hf_wav2vec.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") + parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model") + parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") + parser.add_argument( + "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not" + ) + args = parser.parse_args() + convert_wav2vec2_checkpoint( + args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned + ) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py similarity index 100% rename from src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py rename to src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py diff --git a/src/transformers/models/data2vec/data2vec_audio.py b/src/transformers/models/data2vec/data2vec_audio.py new file mode 100644 index 0000000000000..c9cee07c2671a --- /dev/null +++ 
b/src/transformers/models/data2vec/data2vec_audio.py @@ -0,0 +1,481 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# TODO: Remove file + +import logging +import math +from dataclasses import dataclass, field +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from fairseq.data.data_utils import compute_mask_indices +from fairseq.models import BaseFairseqModel, register_model +from fairseq.models.wav2vec import ConvFeatureExtractionModel, TransformerEncoder, Wav2Vec2Config +from fairseq.modules import GradMultiply, LayerNorm +from fairseq.utils import index_put + +from omegaconf import II + +# Copied from https://github.com/pytorch/fairseq/blob/327cff24a57c2ae06657731bf3be86ee88fccfea/fairseq/modules/ema_module.py +from .ema_module import EMAModule, EMAModuleConfig + + +logger = logging.getLogger(__name__) + + +@dataclass +class Data2VecAudioConfig(Wav2Vec2Config): + + loss_beta: float = field(default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}) + loss_scale: Optional[float] = field( + default=None, + metadata={"help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"}, + ) + average_top_k_layers: int = field(default=8, metadata={"help": "how many layers to average"}) + + layer_norm_target_layer: bool = False + instance_norm_target_layer: bool = False + instance_norm_targets: bool = False + layer_norm_targets: bool = False + batch_norm_target_layer: bool = False + group_norm_target_layer: bool = False + + ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) + ema_end_decay: float = field(default=0.9999, metadata={"help": "final ema decay rate"}) + + # when to finish annealing ema decay rate + ema_anneal_end_step: int = II("optimization.max_update") + + ema_transformer_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer"}, + ) + ema_layers_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer layers"}, + ) + + max_update: int = II("optimization.max_update") + + min_target_var: float = field(default=0.1, metadata={"help": "stop training if target var falls below this"}) + min_pred_var: float = field( + default=0.01, + metadata={"help": "stop training if prediction var falls below this"}, + ) + + +def get_annealed_rate(start, end, curr_step, total_steps): + r = end - start + pct_remaining = 1 - curr_step / total_steps + return end - r * pct_remaining + + +@register_model("data2vec_audio", dataclass=Data2VecAudioConfig) +class Data2VecAudioModel(BaseFairseqModel): + def __init__(self, cfg: Data2VecAudioConfig): + super().__init__() + self.cfg = cfg + + feature_enc_layers = eval(cfg.conv_feature_layers) + self.extractor_embed = feature_enc_layers[-1][0] + + self.ema = None + self.embed = cfg.encoder_embed_dim + + self.average_top_k_layers = cfg.average_top_k_layers + self.loss_beta = cfg.loss_beta + self.loss_scale = cfg.loss_scale + + self.feature_extractor = ConvFeatureExtractionModel( + conv_layers=feature_enc_layers, + dropout=0.0, + mode=cfg.extractor_mode, + conv_bias=cfg.conv_bias, + ) + + self.post_extract_proj = nn.Linear(self.extractor_embed, cfg.encoder_embed_dim) + + self.mask_prob = cfg.mask_prob + self.mask_selection = cfg.mask_selection + self.mask_other = 
cfg.mask_other + self.mask_length = cfg.mask_length + self.no_mask_overlap = cfg.no_mask_overlap + self.mask_min_space = cfg.mask_min_space + + self.mask_channel_prob = cfg.mask_channel_prob + self.mask_channel_before = cfg.mask_channel_before + self.mask_channel_selection = cfg.mask_channel_selection + self.mask_channel_other = cfg.mask_channel_other + self.mask_channel_length = cfg.mask_channel_length + self.no_mask_channel_overlap = cfg.no_mask_channel_overlap + self.mask_channel_min_space = cfg.mask_channel_min_space + + self.dropout_input = nn.Dropout(cfg.dropout_input) + self.dropout_features = nn.Dropout(cfg.dropout_features) + + self.feature_grad_mult = cfg.feature_grad_mult + + self.mask_emb = nn.Parameter(torch.FloatTensor(cfg.encoder_embed_dim).uniform_()) + + self.encoder = TransformerEncoder(cfg) + self.layer_norm = LayerNorm(self.extractor_embed) + + self.final_proj = nn.Linear(self.embed, self.embed) + + self.num_updates = 0 + + def make_ema_teacher(self): + ema_config = EMAModuleConfig( + ema_decay=self.cfg.ema_decay, + ema_fp32=True, + ) + skip_keys = set() + if self.cfg.ema_layers_only: + self.cfg.ema_transformer_only = True + for k, _ in self.encoder.pos_conv.named_parameters(): + skip_keys.add(f"pos_conv.{k}") + + self.ema = EMAModule( + self.encoder if self.cfg.ema_transformer_only else self, + ema_config, + skip_keys=skip_keys, + ) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + + if self.ema is None and self.final_proj is not None: + logger.info("making ema teacher") + self.make_ema_teacher() + elif self.training and self.ema is not None: + if self.cfg.ema_decay != self.cfg.ema_end_decay: + if num_updates >= self.cfg.ema_anneal_end_step: + decay = self.cfg.ema_end_decay + else: + decay = get_annealed_rate( + self.cfg.ema_decay, + self.cfg.ema_end_decay, + num_updates, + self.cfg.ema_anneal_end_step, + ) + self.ema.set_decay(decay) + if self.ema.get_decay() < 1: + self.ema.step(self.encoder if self.cfg.ema_transformer_only else self) + + self.num_updates = num_updates + + def state_dict(self, destination=None, prefix="", keep_vars=False): + state = super().state_dict(destination, prefix, keep_vars) + + if self.ema is not None: + state[prefix + "_ema"] = self.ema.fp32_params + + return state + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + if self.ema is not None: + k = prefix + "_ema" + assert k in state_dict + self.ema.restore(state_dict[k], True) + del state_dict[k] + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + @classmethod + def build_model(cls, cfg: Data2VecAudioConfig, task=None): + """Build a new model instance.""" + + return cls(cfg) + + def apply_mask( + self, + x, + padding_mask, + mask_indices=None, + mask_channel_indices=None, + ): + B, T, C = x.shape + + if self.mask_channel_prob > 0 and self.mask_channel_before: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = torch.from_numpy(mask_channel_indices).to(x.device).unsqueeze(1).expand(-1, T, -1) + x[mask_channel_indices] = 0 + + if self.mask_prob > 0: + if mask_indices is None: + mask_indices = compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=1, + no_overlap=self.no_mask_overlap, + 
min_space=self.mask_min_space, + require_same_masks=self.cfg.require_same_masks, + mask_dropout=self.cfg.mask_dropout, + ) + mask_indices = torch.from_numpy(mask_indices).to(x.device) + x = index_put(x, mask_indices, self.mask_emb) + else: + mask_indices = None + + if self.mask_channel_prob > 0 and not self.mask_channel_before: + if mask_channel_indices is None: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + torch.from_numpy(mask_channel_indices).to(x.device).unsqueeze(1).expand(-1, T, -1) + ) + x = index_put(x, mask_channel_indices, 0) + + return x, mask_indices + + def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): + """ + Computes the output length of the convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + return torch.floor((input_length - kernel_size) / stride + 1) + + conv_cfg_list = eval(self.cfg.conv_feature_layers) + + for i in range(len(conv_cfg_list)): + input_lengths = _conv_out_length(input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2]) + + return input_lengths.to(torch.long) + + def forward( + self, + source, + padding_mask=None, + mask=True, + features_only=False, + layer=None, + mask_indices=None, + mask_channel_indices=None, + padding_count=None, + ): + features = source + + if self.feature_grad_mult > 0: + features = self.feature_extractor(features) + if self.feature_grad_mult != 1.0: + features = GradMultiply.apply(features, self.feature_grad_mult) + else: + with torch.no_grad(): + features = self.feature_extractor(features) + + features = features.transpose(1, 2) + + features = self.layer_norm(features) + + orig_padding_mask = padding_mask + + if padding_mask is not None and padding_mask.any(): + input_lengths = (1 - padding_mask.long()).sum(-1) + # apply conv formula to get real output_lengths + output_lengths = self._get_feat_extract_output_lengths(input_lengths) + + padding_mask = torch.zeros(features.shape[:2], dtype=features.dtype, device=features.device) + + # these two operations makes sure that all values + # before the output lengths indices are attended to + padding_mask[ + ( + torch.arange(padding_mask.shape[0], device=padding_mask.device), + output_lengths - 1, + ) + ] = 1 + padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() + else: + padding_mask = None + + if self.post_extract_proj is not None: + features = self.post_extract_proj(features) + + pre_encoder_features = None + if self.cfg.ema_transformer_only: + pre_encoder_features = features.clone() + + features = self.dropout_input(features) + + if mask: + x, mask_indices = self.apply_mask( + features, + padding_mask, + mask_indices=mask_indices, + mask_channel_indices=mask_channel_indices, + ) + else: + x = features + mask_indices = None + print(f"Before encoder FB: {x.shape} {x}") + x, layer_results = self.encoder( + x, + padding_mask=padding_mask, + layer=layer, + ) + + if features_only: + return { + "x": x, + "padding_mask": padding_mask, + "layer_results": layer_results, + } + + result = { + "losses": {}, + } + + with torch.no_grad(): + self.ema.model.eval() + + if self.cfg.ema_transformer_only: + y, layer_results = self.ema.model.extract_features( + pre_encoder_features, + padding_mask=padding_mask, + min_layer=self.cfg.encoder_layers - self.average_top_k_layers, + ) + 
y = { + "x": y, + "padding_mask": padding_mask, + "layer_results": layer_results, + } + else: + y = self.ema.model.extract_features( + source=source, + padding_mask=orig_padding_mask, + mask=False, + ) + + target_layer_results = [l[2] for l in y["layer_results"]] + + permuted = False + if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: + target_layer_results = [tl.permute(1, 2, 0) for tl in target_layer_results] # TBC -> BCT + permuted = True + + if self.cfg.batch_norm_target_layer: + target_layer_results = [ + F.batch_norm(tl.float(), running_mean=None, running_var=None, training=True) + for tl in target_layer_results + ] + + if self.cfg.instance_norm_target_layer: + target_layer_results = [F.instance_norm(tl.float()) for tl in target_layer_results] + + if permuted: + target_layer_results = [tl.transpose(1, 2) for tl in target_layer_results] # BCT -> BTC + + if self.cfg.group_norm_target_layer: + target_layer_results = [F.layer_norm(tl.float(), tl.shape[-2:]) for tl in target_layer_results] + + if self.cfg.layer_norm_target_layer: + target_layer_results = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in target_layer_results] + + y = sum(target_layer_results) / len(target_layer_results) + + if self.cfg.layer_norm_targets: + y = F.layer_norm(y.float(), y.shape[-1:]) + + if self.cfg.instance_norm_targets: + y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2) + + if not permuted: + y = y.transpose(0, 1) + + y = y[mask_indices] + + x = x[mask_indices] + x = self.final_proj(x) + + sz = x.size(-1) + + if self.loss_beta == 0: + loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) + else: + loss = F.smooth_l1_loss(x.float(), y.float(), reduction="none", beta=self.loss_beta).sum(dim=-1) + + if self.loss_scale is not None: + scale = self.loss_scale + else: + scale = 1 / math.sqrt(sz) + + result["losses"]["regression"] = loss.sum() * scale + + if "sample_size" not in result: + result["sample_size"] = loss.numel() + + with torch.no_grad(): + result["target_var"] = self.compute_var(y) + result["pred_var"] = self.compute_var(x.float()) + + if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var: + logger.error(f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting") + raise Exception(f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting") + if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var: + logger.error(f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting") + raise Exception(f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting") + + if self.ema is not None: + result["ema_decay"] = self.ema.get_decay() * 1000 + + return result + + @staticmethod + def compute_var(y): + y = y.view(-1, y.size(-1)) + if dist.is_initialized(): + zc = torch.tensor(y.size(0)).cuda() + zs = y.sum(dim=0) + zss = (y ** 2).sum(dim=0) + + dist.all_reduce(zc) + dist.all_reduce(zs) + dist.all_reduce(zss) + + var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1)) + return torch.sqrt(var + 1e-6).mean() + else: + return torch.sqrt(y.var(dim=0) + 1e-6).mean() + + def extract_features(self, source, padding_mask, mask=False, layer=None): + res = self.forward( + source, + padding_mask, + mask=mask, + features_only=True, + layer=layer, + ) + return res + + def remove_pretraining_modules(self, last_layer=None): + self.final_proj = None + self.ema = None + if last_layer is not None: + self.encoder.layers = nn.ModuleList(l for i, l in 
enumerate(self.encoder.layers) if i <= last_layer) diff --git a/src/transformers/models/data2vec/data2vec_text.py b/src/transformers/models/data2vec/data2vec_text.py index d8cef5f342224..b55dbddd733dd 100644 --- a/src/transformers/models/data2vec/data2vec_text.py +++ b/src/transformers/models/data2vec/data2vec_text.py @@ -3,6 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +# TODO: Remove file + import logging import math from dataclasses import dataclass, field diff --git a/src/transformers/models/data2vec/ema_module.py b/src/transformers/models/data2vec/ema_module.py new file mode 100644 index 0000000000000..98ceeb4e83735 --- /dev/null +++ b/src/transformers/models/data2vec/ema_module.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 + +# TODO: Remove file + +""" +Used for EMA tracking a given pytorch module. The user is responsible for calling step() and setting the appropriate +decay +""" + +import copy +import logging +from dataclasses import dataclass, field + +import torch +from fairseq.dataclass import FairseqDataclass + + +@dataclass +class EMAModuleConfig(FairseqDataclass): + ema_decay: float = field(default=0.9999, metadata={"help": "decay for exponential moving average model"}) + ema_fp32: bool = field( + default=False, + metadata={"help": "If true, store EMA model in fp32 even if model is in fp16"}, + ) + + +class EMAModule: + """Exponential Moving Average of Fairseq Models""" + + def __init__(self, model, config: EMAModuleConfig, device=None, skip_keys=None): + """ + @param model model to initialize the EMA with @param config EMAConfig object with configuration like ema_decay, + ema_update_freq, ema_fp32 @param device If provided, copy EMA to this device (e.g. gpu). Otherwise EMA is in + the same device as the model. + """ + + self.decay = config.ema_decay + self.model = copy.deepcopy(model) + self.model.requires_grad_(False) + self.config = config + self.skip_keys = skip_keys or set() + self.fp32_params = {} + + if device is not None: + logging.info(f"Copying EMA model to device {device}") + self.model = self.model.to(device=device) + + if self.config.ema_fp32: + self.build_fp32_params() + + self.update_freq_counter = 0 + + def build_fp32_params(self, state_dict=None): + """ + Store a copy of the EMA params in fp32. If state dict is passed, the EMA params is copied from the provided + state dict. Otherwise, it is copied from the current EMA model parameters. + """ + if not self.config.ema_fp32: + raise RuntimeError( + "build_fp32_params should not be called if ema_fp32=False. " + "Use ema_fp32=True if this is really intended." 
+ ) + + if state_dict is None: + state_dict = self.model.state_dict() + + def _to_float(t): + return t.float() if torch.is_floating_point(t) else t + + for param_key in state_dict: + if param_key in self.fp32_params: + self.fp32_params[param_key].copy_(state_dict[param_key]) + else: + self.fp32_params[param_key] = _to_float(state_dict[param_key]) + + def restore(self, state_dict, build_fp32_params=False): + """Load data from a model spec into EMA model""" + self.model.load_state_dict(state_dict, strict=False) + if build_fp32_params: + self.build_fp32_params(state_dict) + + def set_decay(self, decay): + self.decay = decay + + def get_decay(self): + return self.decay + + def _step_internal(self, new_model): + """One update of the EMA model based on new model weights""" + decay = self.decay + + ema_state_dict = {} + ema_params = self.fp32_params if self.config.ema_fp32 else self.model.state_dict() + for key, param in new_model.state_dict().items(): + if isinstance(param, dict): + continue + try: + ema_param = ema_params[key] + except KeyError: + ema_param = param.float().clone() if param.ndim == 1 else copy.deepcopy(param) + + if param.shape != ema_param.shape: + raise ValueError( + "incompatible tensor shapes between model param and ema param" + + "{} vs. {}".format(param.shape, ema_param.shape) + ) + + if "version" in key: + # Do not decay a model.version pytorch param + continue + + if key in self.skip_keys: + ema_param = param.to(dtype=ema_param.dtype).clone() + ema_params[key].copy_(ema_param) + else: + ema_param.mul_(decay) + ema_param.add_(param.to(dtype=ema_param.dtype), alpha=1 - decay) + ema_state_dict[key] = ema_param + self.restore(ema_state_dict, build_fp32_params=False) + + def step(self, new_model): + self._step_internal(new_model) + + def reverse(self, model): + """ + Load the model parameters from EMA model. Useful for inference or fine-tuning from the EMA model. 
+ """ + d = self.model.state_dict() + if "_ema" in d: + del d["_ema"] + + model.load_state_dict(d, strict=False) + return model diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 97aa70b804fa3..2e53b915a65a9 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -45,6 +45,13 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) +from ...models.wav2vec2.modeling_wav2vec2 import ( + Wav2Vec2Adapter, + Wav2Vec2BaseModelOutput, + Wav2Vec2FeatureEncoder, + Wav2Vec2FeatureProjection, + Wav2Vec2PositionalConvEmbedding, +) from ...utils import logging from .configuration_data2vec import Data2VecConfig @@ -684,7 +691,83 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): @add_start_docstrings( - "The bare Data2Vec Model transformer outputting raw hidden-states without any specific head on top.", + "The bare Data2Vec Model for audio transformer outputting raw hidden-states without any specific head on top.", + DATA2VEC_START_DOCSTRING, +) +class Data2VecForAudioModel(Data2VecPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.feature_extractor = Wav2Vec2FeatureEncoder(config) + self.feature_projection = Wav2Vec2FeatureProjection(config) + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) + + self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None + + self.encoder = Data2VecEncoder(config) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + hidden_states, extract_features = self.feature_projection(extract_features) + print(f"Before positional encoding HF:\n {hidden_states.shape} {hidden_states}") + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] 
+ + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The bare Data2Vec Model for text transformer outputting raw hidden-states without any specific head on top.", DATA2VEC_START_DOCSTRING, ) class Data2VecForTextModel(Data2VecPreTrainedModel): diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 197874c4c09e8..9012f4b79c74d 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1142,6 +1142,13 @@ def __init__(self, *args, **kwargs): DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = None +class Data2VecForAudioModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Data2VecForCausalLM(metaclass=DummyObject): _backends = ["torch"] diff --git a/utils/check_repo.py b/utils/check_repo.py index a583196581465..d0148242faeed 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -85,6 +85,7 @@ "TFRobertaForMultipleChoice", # TODO: fix "TrOCRDecoderWrapper", # Building part of bigger (tested) model. "SeparableConv1D", # Building part of bigger (tested) model. + "Data2VecForAudioModel", # TODO: Add tests ] # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't @@ -164,6 +165,7 @@ "VisualBertForMultipleChoice", "TFWav2Vec2ForCTC", "TFHubertForCTC", + "Data2VecForAudioModel", # TODO: Fix ] # This is to make sure the transformers module imported is the one in the repo. From 2830260344ddf50f83184d945ecb406060007dfb Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 18 Feb 2022 11:40:28 +0000 Subject: [PATCH 039/121] finish audio model --- .../models/data2vec/configuration_data2vec.py | 7 +- ..._original_pytorch_checkpoint_to_pytorch.py | 204 ++++++---- .../models/data2vec/data2vec_audio.py | 1 - .../models/data2vec/modeling_data2vec.py | 354 +++++++++++++++++- 4 files changed, 466 insertions(+), 100 deletions(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index dd2260a13cacd..2d423b61d6fe1 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -116,9 +116,9 @@ def __init__( use_cache=True, classifier_dropout=None, # Here starts audio-only parameters + layer_norm_eps_audio=1e-5, feat_extract_norm="layer", feat_extract_activation="gelu", - num_conv_pos_embeddings=19, conv_dim=(512, 512, 512, 512, 512, 512, 512), conv_stride=(5, 2, 2, 2, 2, 2, 2), conv_kernel=(10, 3, 3, 3, 3, 2, 2), @@ -127,6 +127,8 @@ def __init__( mask_time_prob=0.05, do_stable_layer_norm=False, num_conv_pos_embedding_groups=16, + conv_pos_kernel_size=19, + num_conv_pos_embeddings=5, activation_dropout=0.1, add_adapter=False, final_dropout=0.1, @@ -153,7 +155,6 @@ def __init__( # Here starts audio-only parameters self.feat_extract_norm = feat_extract_norm self.feat_extract_activation = feat_extract_activation - self.num_conv_pos_embeddings = num_conv_pos_embeddings self.conv_dim = conv_dim self.conv_stride = conv_stride self.conv_kernel = conv_kernel @@ -163,6 +164,8 @@ def __init__( self.mask_time_prob = mask_time_prob self.do_stable_layer_norm = do_stable_layer_norm self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + 
self.conv_pos_kernel_size = conv_pos_kernel_size + self.num_conv_pos_embeddings = num_conv_pos_embeddings # Using the same dropout as for text self.hidden_dropout = self.hidden_dropout_prob # Using the same dropout as for text diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py index abf8285674f3e..f039e8f89c708 100644 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -19,15 +19,17 @@ import os import torch -from datasets import load_dataset +from functools import reduce -import soundfile as sf from transformers import Wav2Vec2Processor, logging +import fairseq +from datasets import load_dataset -from ..data2vec import Data2VecConfig, Data2VecForAudioModel +from transformers import Data2VecConfig, Data2VecForAudioModel +from transformers.models.data2vec.modeling_data2vec import Data2VecForCTC # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py -from .data2vec_audio import Data2VecAudioModel +from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel # noqa: F401 logging.set_verbosity_info() @@ -36,30 +38,21 @@ MAPPING = { "post_extract_proj": "feature_projection.projection", "models.0.layer_norm": "feature_projection.layer_norm", - "encoder.pos_conv.0": "pos_conv_embed.conv", - "self_attn.k_proj": "encoder.layer.*.attention.self.key", - "self_attn.v_proj": "encoder.layer.*.attention.self.value", - "self_attn.q_proj": "encoder.layer.*.attention.self.query", - "self_attn.out_proj.weight": "encoder.layer.*.attention.output.dense", - "self_attn_layer_norm": "encoder.layer.*.attention.output.LayerNorm", - "fc1": "encoder.layer.*.intermediate.dense", - "fc2": "encoder.layer.*.output.dense", - "final_layer_norm": "encoder.layer.*.output.LayerNorm", - "encoder.layer_norm": "layer_norm", + "self_attn.k_proj": "encoder.layers.*.attention.k_proj", + "self_attn.v_proj": "encoder.layers.*.attention.v_proj", + "self_attn.q_proj": "encoder.layers.*.attention.q_proj", + "self_attn.out_proj": "encoder.layers.*.attention.out_proj", + "self_attn_layer_norm": "encoder.layers.*.layer_norm", + "fc1": "encoder.layers.*.feed_forward.intermediate_dense", + "fc2": "encoder.layers.*.feed_forward.output_dense", + "final_layer_norm": "encoder.layers.*.final_layer_norm", + "encoder.layer_norm": "encoder.layer_norm", "w2v_model.layer_norm": "feature_projection.layer_norm", - "quantizer.weight_proj": "quantizer.weight_proj", - "quantizer.vars": "quantizer.codevectors", - "project_q": "project_q", - # "final_proj": "project_hid", "w2v_encoder.proj": "lm_head", "mask_emb": "masked_spec_embed", } TOP_LEVEL_KEYS = [ "lm_head", - "quantizer.weight_proj", - "quantizer.codevectors", - "project_q", - "project_hid", ] @@ -95,7 +88,13 @@ def recursively_load_weights(fairseq_model, hf_model, is_headless): unused_weights = [] fairseq_dict = fairseq_model.state_dict() - feature_extractor = hf_model.feature_extractor + if not is_headless: + feature_extractor = hf_model.data2vec.feature_extractor + pos_conv_embedding = hf_model.data2vec.encoder.pos_conv_embed + + else: + feature_extractor = hf_model.feature_extractor + pos_conv_embedding = hf_model.encoder.pos_conv_embed for name, value in fairseq_dict.items(): is_used = False @@ -105,12 +104,20 @@ def 
recursively_load_weights(fairseq_model, hf_model, is_headless): value, feature_extractor, unused_weights, - hf_model.config.feat_extract_norm == "group", + ) + is_used = True + elif "pos_conv" in name: + load_pos_conv_layer( + name, + value, + pos_conv_embedding, + unused_weights, ) is_used = True else: for key, mapped_key in MAPPING.items(): - # mapped_key = "wav2vec2." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key + if not is_headless: + mapped_key = "data2vec." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]: is_used = True if "*" in mapped_key: @@ -135,44 +142,55 @@ def recursively_load_weights(fairseq_model, hf_model, is_headless): logger.warning(f"Unused weights: {unused_weights}") -def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): +def access_by_string(module, path): + names = path.split(".") + return reduce(getattr, names, module) + + +def set_weights(full_name, module, fsq_value, hf_weight_path): + hf_weight = access_by_string(module, hf_weight_path) + hf_value = hf_weight.data + + if fsq_value.shape != hf_value.shape: + raise ValueError( + f"{full_name} has size {fsq_value.shape}, but {hf_value.shape} was found." + ) + hf_weight.data = fsq_value + logger.info(f"{full_name} was correctly initialized from {hf_weight_path}.") + + +def load_conv_layer(full_name, value, feature_extractor, unused_weights): name = full_name.split("conv_layers.")[-1] items = name.split(".") layer_id = int(items[0]) type_id = int(items[1]) + weight_type = name.split(".")[-1] if type_id == 0: - if "bias" in name: - if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape: - raise ValueError( - f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.bias.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape: - raise ValueError( - f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.weight.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): - if "bias" in name: - if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape: - raise ValueError( - f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape: - raise ValueError( - f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found." 
- ) - feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") + layer_type = "conv" + elif type_id == 2: + layer_type = "layer_norm" else: unused_weights.append(full_name) + return + + set_weights(full_name, feature_extractor, value, f"conv_layers.{layer_id}.{layer_type}.{weight_type}") + + +def load_pos_conv_layer(full_name, value, pos_conv_embeddings, unused_weights): + name = full_name.split("pos_conv.")[-1] + items = name.split(".") + layer_id = int(items[0]) + type_id = int(items[1]) + + weight_type = name.split(".")[-1] + if type_id != 0: + unused_weights.append(full_name) + return + else: + layer_type = "conv" + + set_weights(full_name, pos_conv_embeddings, value, f"layers.{layer_id}.{layer_type}.{weight_type}") @torch.no_grad() @@ -185,40 +203,59 @@ def convert_wav2vec2_checkpoint( if config_path is not None: config = Data2VecConfig.from_pretrained(config_path) else: - config = Data2VecConfig() + config = Data2VecConfig(layer_norm_eps=1e-5, vocab_size=32) + + if not is_finetuned: + # Modify final_proj layer name + hf_wav2vec = Data2VecForAudioModel(config) + data2vec_checkpoint_dir = os.path.dirname(checkpoint_path) + + state_dict = torch.load(checkpoint_path) + state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight") + state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias") + converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt") + torch.save(state_dict, converted_ckpt) + else: + hf_wav2vec = Data2VecForCTC(config) + converted_ckpt = checkpoint_path - # hf_wav2vec = Wav2Vec2ForPreTraining(config) - hf_wav2vec = Data2VecForAudioModel(config) + def load_data2vec(path): + model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([path]) + return model[0].eval() - # Modify final_proj layer name - data2vec_checkpoint_dir = os.path.dirname(checkpoint_path) - state_dict = torch.load(checkpoint_path) - state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight") - state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias") - converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt") - torch.save(state_dict, converted_ckpt) - model = Data2VecAudioModel.from_pretrained(data2vec_checkpoint_dir, checkpoint_file=converted_ckpt) - model = model.eval() + model = load_data2vec(converted_ckpt) recursively_load_weights(model, hf_wav2vec, not is_finetuned) - def map_to_array(batch): - speech, _ = sf.read(batch["file"]) - batch["speech"] = speech - return batch + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60") + + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") + input_audio = [x["array"] for x in ds[:4]["audio"]] + + inputs = processor(input_audio, return_tensors="pt", padding=True) + + input_values = inputs.input_values + attention_mask = inputs.attention_mask +# input_values = inputs.input_values[:, :-1] +# attention_mask = inputs.attention_mask[:, :-1] - processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - ds = ds.map(map_to_array) - input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 hf_wav2vec.eval() - our_output = hf_wav2vec(input_values)["last_hidden_state"] - # our_hidden_states = 
hf_wav2vec(input_values, output_hidden_states=-True).hidden_states model.eval() - their_output = model.models[0](input_values, mask=False, features_only=True)["layer_results"][-1][0].transpose( - 0, 1 - ) - # their_hidden_states = model.models[0](input_values, mask=False, features_only=True)["layer_results"] + if is_finetuned: + their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)["encoder_out"].transpose( + 0, 1 + ) + our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["logits"] + + pred_ids = torch.argmax(our_output, dim=-1) + output_string = processor.batch_decode(pred_ids) + + print(f"Expected Output: {ds[:4]['text']}, Pred: {output_string}") + else: + their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)["layer_results"][-1][0].transpose( + 0, 1 + ) + our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["last_hidden_state"] print(our_output.shape, their_output.shape) max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() @@ -230,6 +267,11 @@ def map_to_array(batch): hf_wav2vec.save_pretrained(pytorch_dump_folder_path) + if is_finetuned: + processor.save_pretrained(pytorch_dump_folder_path) + else: + processor.feature_extractor.save_pretrained(pytorch_dump_folder_path) + if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/src/transformers/models/data2vec/data2vec_audio.py b/src/transformers/models/data2vec/data2vec_audio.py index c9cee07c2671a..a5d91b82186f1 100644 --- a/src/transformers/models/data2vec/data2vec_audio.py +++ b/src/transformers/models/data2vec/data2vec_audio.py @@ -334,7 +334,6 @@ def forward( else: x = features mask_indices = None - print(f"Before encoder FB: {x.shape} {x}") x, layer_results = self.encoder( x, padding_mask=padding_mask, diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 2e53b915a65a9..a3420d84b195a 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -16,12 +16,14 @@ import math +from typing import Union, Optional import torch import torch.utils.checkpoint from packaging import version from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +import numpy as np from ...activations import ACT2FN, gelu from ...file_utils import ( add_code_sample_docstrings, @@ -29,6 +31,7 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) +from ...deepspeed import is_deepspeed_zero3_enabled from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, @@ -38,6 +41,7 @@ QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, + CausalLMOutput, ) from ...modeling_utils import ( PreTrainedModel, @@ -45,12 +49,13 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) +from ...modeling_utils import torch_int_div +from ...modeling_outputs import BaseModelOutput from ...models.wav2vec2.modeling_wav2vec2 import ( Wav2Vec2Adapter, Wav2Vec2BaseModelOutput, Wav2Vec2FeatureEncoder, - Wav2Vec2FeatureProjection, - Wav2Vec2PositionalConvEmbedding, + Wav2Vec2EncoderLayer, ) from ...utils import logging from .configuration_data2vec import Data2VecConfig @@ -58,10 +63,21 @@ logger = logging.get_logger(__name__) + +_HIDDEN_STATES_START_POSITION = 2 + _CHECKPOINT_FOR_DOC = "data2vec" _CONFIG_FOR_DOC = "Data2VecConfig" 
_TOKENIZER_FOR_DOC = "RobertaTokenizer" +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2Config" +_PROCESSOR_FOR_DOC = "Wav2Vec2Processor" + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" +_CTC_EXPECTED_LOSS = 53.48 + DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = [ "data2vec", # See all data2vec models at https://huggingface.co/models?filter=data2vec @@ -582,7 +598,6 @@ def forward(self, hidden_states): return pooled_output -# Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->Data2Vec,roberta->data2vec class Data2VecPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -593,7 +608,6 @@ class Data2VecPreTrainedModel(PreTrainedModel): base_model_prefix = "data2vec" supports_gradient_checkpointing = True - # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): """Initialize the weights""" if isinstance(module, nn.Linear): @@ -607,8 +621,10 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, Data2VecEncoder): @@ -690,6 +706,157 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): """ +class Data2VecPadLayer(nn.Module): + def __init__(self, conv_pos_kernel_size): + super().__init__() + self.num_pad_remove = 1 if conv_pos_kernel_size % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Data2VecPositionalConvLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.conv_pos_kernel_size, + padding=config.conv_pos_kernel_size // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + self.padding = Data2VecPadLayer(config.conv_pos_kernel_size) + self.activation = ACT2FN[config.feat_extract_activation] + # no learnable parameters + self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Data2VecPositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.layers = nn.ModuleList([Data2VecPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]) + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + for layer in self.layers: + hidden_states = layer(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Data2VecFeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], 
eps=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +class Data2VecAudioEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Data2VecPositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + hidden_states[~attention_mask] = 0.0 + + # extend attention_mask + attention_mask = (1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)) * -10000.0 + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + @add_start_docstrings( "The bare Data2Vec Model for audio transformer outputting raw hidden-states without any specific head on top.", DATA2VEC_START_DOCSTRING, @@ -699,21 +866,63 @@ def __init__(self, config): super().__init__(config) self.config = config self.feature_extractor = 
Wav2Vec2FeatureEncoder(config) - self.feature_projection = Wav2Vec2FeatureProjection(config) - self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) - self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feature_projection = Data2VecFeatureProjection(config) self.dropout = nn.Dropout(config.hidden_dropout) # model only needs masking vector if mask prob is > 0.0 if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) + self.encoder = Data2VecAudioEncoder(config) + self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None - self.encoder = Data2VecEncoder(config) # Initialize weights and apply final processing self.post_init() + def _get_feat_extract_output_lengths( + self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch_int_div(input_length - kernel_size, stride) + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + def _get_feature_vector_attention_mask( + self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None + ): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. 
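# Illustrative sketch (assuming the default conv stack of this patch series,
# conv_kernel=(10, 3, 3, 3, 3, 2, 2) and conv_stride=(5, 2, 2, 2, 2, 2, 2)): each
# convolutional layer maps length -> floor((length - kernel) / stride) + 1, the same
# formula as the `_conv_out_length` helper above, so a one-second 16000-sample
# waveform shrinks 16000 -> 3199 -> 1599 -> 799 -> 399 -> 199 -> 99 -> 49 frames.
# The standalone snippet below (names are hypothetical) reproduces that reduction and
# the flip/cumsum/flip trick used just below to turn per-example output lengths into
# a boolean frame-level attention mask.

import torch


def conv_output_length(length, kernels=(10, 3, 3, 3, 3, 2, 2), strides=(5, 2, 2, 2, 2, 2, 2)):
    # same arithmetic as `_conv_out_length`, on plain Python ints
    for kernel, stride in zip(kernels, strides):
        length = (length - kernel) // stride + 1
    return length


output_lengths = torch.tensor([conv_output_length(16000), conv_output_length(8000)])  # tensor([49, 24])
feature_vector_length = int(output_lengths.max())

mask = torch.zeros((2, feature_vector_length), dtype=torch.long)
mask[(torch.arange(2), output_lengths - 1)] = 1      # mark the last valid frame of each example
mask = mask.flip([-1]).cumsum(-1).flip([-1]).bool()  # every frame up to that index becomes True
print(mask.sum(-1))                                  # tensor([49, 24])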
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] + + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) + output_lengths = output_lengths.to(torch.long) + + batch_size = attention_mask.shape[0] + + attention_mask = torch.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + return attention_mask + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, @@ -737,13 +946,14 @@ def forward( extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) - hidden_states, extract_features = self.feature_projection(extract_features) - print(f"Before positional encoding HF:\n {hidden_states.shape} {hidden_states}") - position_embeddings = self.pos_conv_embed(hidden_states) - hidden_states = hidden_states + position_embeddings - hidden_states = self.layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states) + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) encoder_outputs = self.encoder( hidden_states, @@ -766,6 +976,118 @@ def forward( ) +@add_start_docstrings( + """Data2Vec Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", + DATA2VEC_START_DOCSTRING, +) +class Data2VecForCTC(Data2VecPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.data2vec = Data2VecForAudioModel(config) + self.dropout = nn.Dropout(config.final_dropout) + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." + ) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_PROCESSOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_CTC_EXPECTED_OUTPUT, + expected_loss=_CTC_EXPECTED_LOSS, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. 
Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.data2vec( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + + if labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + flattened_targets = labels.masked_select(labels_mask) + + # ctc_loss doesn't support fp16 + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + + with torch.backends.cudnn.flags(enabled=False): + loss = nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + @add_start_docstrings( "The bare Data2Vec Model for text transformer outputting raw hidden-states without any specific head on top.", DATA2VEC_START_DOCSTRING, From 23b7de8427b0163d0236138811d636a1ae5d8fd4 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 18 Feb 2022 18:13:58 +0000 Subject: [PATCH 040/121] finish audio file --- src/transformers/models/data2vec/__init__.py | 62 +- .../data2vec/configuration_data2vec_audio.py | 320 +++ ...2vec.py => configuration_data2vec_text.py} | 66 +- ..._original_pytorch_checkpoint_to_pytorch.py | 21 +- .../data2vec/modeling_data2vec_audio.py | 1831 +++++++++++++++++ ..._data2vec.py => modeling_data2vec_text.py} | 555 +---- 6 files changed, 2293 insertions(+), 562 deletions(-) create mode 100644 src/transformers/models/data2vec/configuration_data2vec_audio.py rename src/transformers/models/data2vec/{configuration_data2vec.py => configuration_data2vec_text.py} (72%) create mode 100755 src/transformers/models/data2vec/modeling_data2vec_audio.py rename src/transformers/models/data2vec/{modeling_data2vec.py => modeling_data2vec_text.py} (75%) diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 00873fff121d3..1abf2a9f9efac 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -22,38 +22,52 @@ _import_structure = { - "configuration_data2vec": ["DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecConfig", "Data2VecOnnxConfig"], + 
"configuration_data2vec_text": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecTextConfig", "Data2VecTextOnnxConfig"], + "configuration_data2vec_audio": ["DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig", "Data2VecAudioOnnxConfig"], } if is_torch_available(): - _import_structure["modeling_data2vec"] = [ - "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", - "Data2VecForAudioModel", - "Data2VecForCausalLM", - "Data2VecForMaskedLM", - "Data2VecForMultipleChoice", - "Data2VecForQuestionAnswering", - "Data2VecForSequenceClassification", - "Data2VecForTextModel", - "Data2VecForTokenClassification", - "Data2VecPreTrainedModel", + _import_structure["modeling_data2vec_text"] = [ + "DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecTextModel", + "Data2VecTextForCausalLM", + "Data2VecTextForMaskedLM", + "Data2VecTextForMultipleChoice", + "Data2VecTextForQuestionAnswering", + "Data2VecTextForSequenceClassification", + "Data2VecTextForTextModel", + "Data2VecTextForTokenClassification", + "Data2VecTextPreTrainedModel", + ] + _import_structure["modeling_data2vec_audio"] = [ + "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecAudioModel", + "Data2VecAudioPreTrainedModel", + "Data2VecAudioForCTC", ] if TYPE_CHECKING: - from .configuration_data2vec import DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecConfig, Data2VecOnnxConfig + from .configuration_data2vec_text import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecTextConfig, Data2VecTextOnnxConfig + from .configuration_data2vec_audio import DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig, Data2VecAudioOnnxConfig if is_torch_available(): - from .modeling_data2vec import ( - DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecForAudioModel, - Data2VecForCausalLM, - Data2VecForMaskedLM, - Data2VecForMultipleChoice, - Data2VecForQuestionAnswering, - Data2VecForSequenceClassification, - Data2VecForTextModel, - Data2VecForTokenClassification, - Data2VecPreTrainedModel, + from .modeling_data2vec_text import ( + DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecTextForAudioModel, + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTextModel, + Data2VecTextForTokenClassification, + Data2VecTextPreTrainedModel, + ) + from .modeling_data2vec_audio import ( + DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecAudioModel, + Data2VecAudioPreTrainedModel, + Data2VecAudioForCTC, ) else: diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py new file mode 100644 index 0000000000000..b2d1856a63b5e --- /dev/null +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -0,0 +1,320 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Data2VecAudio model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +DATA_2_VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/data2vec-base-960h": "https://huggingface.co/facebook/data2vec-audio-base-960h/resolve/main/config.json", + # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio +} + + +class Data2VecAudioConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate an + Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Data2VecAudio + [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32): + Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`]. Vocabulary size of the + model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward + method of [`Data2VecAudioModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + final_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the final projection layer of [`Data2VecAudioForCTC`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + feat_extract_norm (`str`, *optional*, defaults to `"group"`): + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group + normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D + convolutional layers. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for output of the feature encoder. + feat_extract_activation (`str, `optional`, defaults to `"gelu"`): + The non-linear activation function (function or string) in the 1D convolutional layers of the feature + extractor. 
If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for quantized feature encoder states. + conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): + A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. + conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length + of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. + conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The + length of *conv_kernel* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_bias (`bool`, *optional*, defaults to `False`): + Whether the 1D convolutional layers have a bias. + num_conv_pos_embeddings (`int`, *optional*, defaults to 128): + Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional + embeddings layer. + num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): + Number of groups of 1D convolutional positional embeddings layer. + mask_time_prob (`float`, *optional*, defaults to 0.05): + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If + reasoning from the propability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + mask_time_length (`int`, *optional*, defaults to 10): + Length of vector span along the time axis. + mask_time_min_masks (`int`, *optional*, defaults to 2),: + The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks'' + mask_feature_prob (`float`, *optional*, defaults to 0.0): + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over + the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, *optional*, defaults to 10): + Length of vector span along the feature axis. + mask_feature_min_masks (`int`, *optional*, defaults to 0),: + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`. 
Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' + num_codevectors_per_group (`int`, *optional*, defaults to 320): + Number of entries in each quantization codebook (group). + num_codevector_groups (`int`, *optional*, defaults to 2): + Number of codevector groups for product codevector quantization. + contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): + The temperature *kappa* in the contrastive loss. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. + num_negatives (`int`, *optional*, defaults to 100): + Number of negative samples for the contrastive loss. + codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the quantized feature vectors. + proj_codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the final projection of both the quantized and the transformer features. + diversity_loss_weight (`int`, *optional*, defaults to 0.1): + The weight of the codebook diversity loss component. + ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`Data2VecAudioForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`Data2VecAudioForCTC`]. + use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): + Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an + instance of [`Data2VecAudioForSequenceClassification`]. + classifier_proj_size (`int`, *optional*, defaults to 256): + Dimensionality of the projection before token mean-pooling for classification. + tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): + A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* + module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. + tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the + *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. + tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): + A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the + *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. + xvector_output_dim (`int`, *optional*, defaults to 512): + Dimensionality of the *XVector* embedding vectors. + add_adapter (`bool`, *optional*, defaults to `False`): + Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful for + warm-starting Data2VecAudio for SpeechEncoderDecoder models. + adapter_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adapter_stride (`int`, *optional*, defaults to 2): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. 
+ num_adapter_layers (`int`, *optional*, defaults to 3): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + output_hidden_size (`int`, *optional*): + Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant + if `add_adapter is True`. + + Example: + + ```python + >>> from transformers import Data2VecAudioModel, Data2VecAudioConfig + + >>> # Initializing a Data2VecAudio facebook/wav2vec2-base-960h style configuration + >>> configuration = Data2VecAudioConfig() + + >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration + >>> model = Data2VecAudioModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "data2vec-audio" + + def __init__( + self, + vocab_size=32, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout=0.1, + activation_dropout=0.1, + attention_dropout=0.1, + feat_proj_dropout=0.0, + feat_quantizer_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + feat_extract_norm="group", + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + num_conv_pos_embedding_groups=16, + conv_pos_kernel_size=19, + num_conv_pos_embeddings=5, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + num_codevectors_per_group=320, + num_codevector_groups=2, + contrastive_logits_temperature=0.1, + num_negatives=100, + codevector_dim=256, + proj_codevector_dim=256, + diversity_loss_weight=0.1, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + classifier_proj_size=256, + tdnn_dim=(512, 512, 512, 512, 1500), + tdnn_kernel=(5, 3, 3, 1, 1), + tdnn_dilation=(1, 2, 3, 1, 1), + xvector_output_dim=512, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=3, + output_hidden_size=None, + **kwargs + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.conv_pos_kernel_size = conv_pos_kernel_size + self.num_feat_extract_layers = len(self.conv_dim) + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.use_weighted_layer_sum = use_weighted_layer_sum + + if ( + (len(self.conv_stride) != self.num_feat_extract_layers) + 
or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. " + "It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, " + f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) " + f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # parameters for pretraining with codevector quantized representations + self.num_codevectors_per_group = num_codevectors_per_group + self.num_codevector_groups = num_codevector_groups + self.contrastive_logits_temperature = contrastive_logits_temperature + self.feat_quantizer_dropout = feat_quantizer_dropout + self.num_negatives = num_negatives + self.codevector_dim = codevector_dim + self.proj_codevector_dim = proj_codevector_dim + self.diversity_loss_weight = diversity_loss_weight + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.output_hidden_size = output_hidden_size or hidden_size + + # SequenceClassification-specific parameter. Feel free to ignore for other classes. + self.classifier_proj_size = classifier_proj_size + + # XVector-specific parameters. Feel free to ignore for other classes. + self.tdnn_dim = list(tdnn_dim) + self.tdnn_kernel = list(tdnn_kernel) + self.tdnn_dilation = list(tdnn_dilation) + self.xvector_output_dim = xvector_output_dim diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec_text.py similarity index 72% rename from src/transformers/models/data2vec/configuration_data2vec.py rename to src/transformers/models/data2vec/configuration_data2vec_text.py index 2d423b61d6fe1..a6843c650cf1c 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Data2Vec configuration""" +""" Data2VecText configuration""" from collections import OrderedDict from typing import Mapping @@ -28,12 +28,12 @@ } -class Data2VecConfig(PretrainedConfig): +class Data2VecTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecForTextModel`] and - [`Data2VecForAudioModel`]. It is used to instantiate a Data2Vec model according to the specified arguments, + This is the configuration class to store the configuration of a [`Data2VecTextForTextModel`] and + [`Data2VecTextForAudioModel`]. It is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration - to that of the Data2Vec [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. + to that of the Data2VecText [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -82,18 +82,18 @@ class Data2VecConfig(PretrainedConfig): Examples: ```python - >>> from transformers import Data2VecForTextModel, Data2VecConfig + >>> from transformers import Data2VecTextModel, Data2VecTextConfig - >>> # Initializing a Data2Vec edugp/data2vec-nlp-base style configuration - >>> configuration = Data2VecConfig() + >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration + >>> configuration = Data2VecTextConfig() >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration - >>> model = Data2VecForTextModel(configuration) + >>> model = Data2VecTextModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "data2vec" + model_type = "data2vec_text" def __init__( self, @@ -115,24 +115,6 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - # Here starts audio-only parameters - layer_norm_eps_audio=1e-5, - feat_extract_norm="layer", - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - feat_proj_dropout=0.0, - mask_time_prob=0.05, - do_stable_layer_norm=False, - num_conv_pos_embedding_groups=16, - conv_pos_kernel_size=19, - num_conv_pos_embeddings=5, - activation_dropout=0.1, - add_adapter=False, - final_dropout=0.1, - proj_codevector_dim=256, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -152,31 +134,9 @@ def __init__( self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout - # Here starts audio-only parameters - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_feat_extract_layers = len(self.conv_dim) - self.feat_proj_dropout = feat_proj_dropout - self.mask_time_prob = mask_time_prob - self.do_stable_layer_norm = do_stable_layer_norm - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.conv_pos_kernel_size = conv_pos_kernel_size - self.num_conv_pos_embeddings = num_conv_pos_embeddings - # Using the same dropout as for text - self.hidden_dropout = self.hidden_dropout_prob - # Using the same dropout as for text - self.attention_dropout = self.attention_probs_dropout_prob - self.activation_dropout = activation_dropout - self.add_adapter = add_adapter - self.final_dropout = final_dropout - self.proj_codevector_dim = proj_codevector_dim - - -class Data2VecOnnxConfig(OnnxConfig): + + +class Data2VecTextOnnxConfig(OnnxConfig): @property def inputs(self) -> Mapping[str, Mapping[int, str]]: return OrderedDict( diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py 
b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py index f039e8f89c708..9445afe54049c 100644 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -25,12 +25,11 @@ import fairseq from datasets import load_dataset -from transformers import Data2VecConfig, Data2VecForAudioModel -from transformers.models.data2vec.modeling_data2vec import Data2VecForCTC - # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py -from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel # noqa: F401 +from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy # noqa: F401 +from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioModel, Data2VecAudioForCTC +from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -89,8 +88,8 @@ def recursively_load_weights(fairseq_model, hf_model, is_headless): fairseq_dict = fairseq_model.state_dict() if not is_headless: - feature_extractor = hf_model.data2vec.feature_extractor - pos_conv_embedding = hf_model.data2vec.encoder.pos_conv_embed + feature_extractor = hf_model.data2vec_audio.feature_extractor + pos_conv_embedding = hf_model.data2vec_audio.encoder.pos_conv_embed else: feature_extractor = hf_model.feature_extractor @@ -117,7 +116,7 @@ def recursively_load_weights(fairseq_model, hf_model, is_headless): else: for key, mapped_key in MAPPING.items(): if not is_headless: - mapped_key = "data2vec." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key + mapped_key = "data2vec_audio." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]: is_used = True if "*" in mapped_key: @@ -201,13 +200,13 @@ def convert_wav2vec2_checkpoint( Copy/paste/tweak model's weights to transformers design. """ if config_path is not None: - config = Data2VecConfig.from_pretrained(config_path) + config = Data2VecAudioConfig.from_pretrained(config_path) else: - config = Data2VecConfig(layer_norm_eps=1e-5, vocab_size=32) + config = Data2VecAudioConfig() if not is_finetuned: # Modify final_proj layer name - hf_wav2vec = Data2VecForAudioModel(config) + hf_wav2vec = Data2VecAudioModel(config) data2vec_checkpoint_dir = os.path.dirname(checkpoint_path) state_dict = torch.load(checkpoint_path) @@ -216,7 +215,7 @@ def convert_wav2vec2_checkpoint( converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt") torch.save(state_dict, converted_ckpt) else: - hf_wav2vec = Data2VecForCTC(config) + hf_wav2vec = Data2VecAudioForCTC(config) converted_ckpt = checkpoint_path def load_data2vec(path): diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py new file mode 100755 index 0000000000000..933458d2b5d4d --- /dev/null +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -0,0 +1,1831 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Data2VecAudio model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...deepspeed import is_deepspeed_zero3_enabled +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel, torch_int_div +from ...utils import logging +from .configuration_data2vec_audio import Data2VecAudioConfig + + +logger = logging.get_logger(__name__) + + +_HIDDEN_STATES_START_POSITION = 2 + +# General docstring +_CONFIG_FOR_DOC = "Data2VecAudioConfig" +_PROCESSOR_FOR_DOC = "Wav2Vec2Processor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/data2vec-audio-base-960h" +_EXPECTED_OUTPUT_SHAPE = [1, 292, 768] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" +_CTC_EXPECTED_LOSS = 53.48 + +# Audio class docstring +_FEAT_EXTRACTOR_FOR_DOC = "Wav2Vec2FeatureExtractor" +_SEQ_CLASS_CHECKPOINT = "superb/data2vec-audio-base-superb-ks" +_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'" +_SEQ_CLASS_EXPECTED_LOSS = 6.54 + +# Frame class docstring +_FRAME_CLASS_CHECKPOINT = "anton-l/data2vec-audio-base-superb-sd" +_FRAME_EXPECTED_OUTPUT = [0, 0] + +# Speaker Verification docstring +_XVECTOR_CHECKPOINT = "anton-l/data2vec-audio-base-superb-sv" +_XVECTOR_EXPECTED_OUTPUT = 0.98 + + +DATA_2_VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/data2vec-audio-base-960h", + "facebook/data2vec-audio-large-960h", + "facebook/data2vec-audio-large-960h-lv60", + "facebook/data2vec-audio-large-960h-lv60-self", + # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio +] + + +@dataclass +class Data2VecAudioBaseModelOutput(ModelOutput): + """ + Output type of [`Data2VecAudioBaseModelOutput`], with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`): + Sequence of extracted feature vectors of the last convolutional layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: torch.FloatTensor = None + extract_features: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class Data2VecAudioForPreTrainingOutput(ModelOutput): + """ + Output type of [`Data2VecAudioForPreTraining`], with potential hidden states and attentions. + + Args: + loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. + projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + """ + + loss: Optional[torch.FloatTensor] = None + projected_states: torch.FloatTensor = None + projected_quantized_states: torch.FloatTensor = None + codevector_perplexity: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + contrastive_loss: Optional[torch.FloatTensor] = None + diversity_loss: Optional[torch.FloatTensor] = None + + +@dataclass +class XVectorOutput(ModelOutput): + """ + Output type of [`Data2VecAudioForXVector`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification loss. 
+ logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`): + Classification hidden states before AMSoftmax. + embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`): + Utterance embeddings used for vector similarity-based retrieval. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + embeddings: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. 
+ """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.sum(-1).detach().tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
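# Illustrative sanity check of the span arithmetic above (assuming the default audio
# settings mask_time_prob=0.05, mask_time_length=10, mask_time_min_masks=2): for a
# one-second input of ~49 frames, int(0.05 * 49 / 10 + eps) is at most 1, so the
# `min_masks` floor dominates and two 10-frame spans (roughly 40% of the sequence)
# get masked during training. The hypothetical helper below uses a fixed epsilon and
# a simplified clamp rather than the exact logic of `compute_num_masked_span`:

def approx_num_masked_spans(input_length, mask_prob=0.05, mask_length=10, min_masks=2, epsilon=0.0):
    num_spans = int(mask_prob * input_length / mask_length + epsilon)
    num_spans = max(num_spans, min_masks)
    return min(num_spans, input_length // mask_length)  # never mask more spans than fit


print(approx_num_masked_spans(49))    # 2 spans -> 20 of 49 frames masked
print(approx_num_masked_spans(1600))  # 8 spans -> 80 frames masked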
+ if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = np.concatenate( + [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx) + + spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + + # add offset to the starting indexes so that that indexes now create a span + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + batch_size, max_num_masked_span * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + return spec_aug_mask + + +def _sample_negative_indices( + features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None +): + """ + Sample `num_negatives` vectors from feature vectors. + """ + batch_size, sequence_length = features_shape + + # generate indices of the positive vectors themselves, repeat them `num_negatives` times + sequence_length_range = np.arange(sequence_length) + + # get `num_negatives` random vector indices from the same utterance + sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) + + mask_time_indices = ( + mask_time_indices.astype(np.bool) if mask_time_indices is not None else np.ones(features_shape, dtype=np.bool) + ) + + for batch_idx in range(batch_size): + high = mask_time_indices[batch_idx].sum() - 1 + mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]] + + feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives)) + sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives)) + # avoid sampling the same positive vector, but keep the distribution uniform + sampled_indices[sampled_indices >= feature_indices] += 1 + + # remap to actual indices + sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices] + + # correct for batch size + sampled_negative_indices[batch_idx] += batch_idx * sequence_length + + return sampled_negative_indices + + +class Data2VecAudioConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = 
self.conv(hidden_states) + + hidden_states = hidden_states.transpose(-2, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(-2, -1) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Data2VecAudioPadLayer(nn.Module): + def __init__(self, conv_pos_kernel_size): + super().__init__() + self.num_pad_remove = 1 if conv_pos_kernel_size % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Data2VecAudioPositionalConvLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.conv_pos_kernel_size, + padding=config.conv_pos_kernel_size // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + self.padding = Data2VecAudioPadLayer(config.conv_pos_kernel_size) + self.activation = ACT2FN[config.feat_extract_activation] + # no learnable parameters + self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Data2VecAudioPositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.layers = nn.ModuleList([Data2VecAudioPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]) + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + for layer in self.layers: + hidden_states = layer(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Data2VecAudioFeatureEncoder(nn.Module): + """Construct the features from raw audio waveform""" + + def __init__(self, config): + super().__init__() + self.conv_layers = nn.ModuleList([ + Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) + ]) + self.gradient_checkpointing = False + self._requires_grad = True + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def forward(self, input_values): + hidden_states = input_values[:, None] + + # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True + + for conv_layer in self.conv_layers: + if self._requires_grad and self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(conv_layer), + hidden_states, + ) + else: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class Data2VecAudioFeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = 
self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Data2VecAudio +class Data2VecAudioAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class Data2VecAudioFeedForward(nn.Module): + def __init__(self, config): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class Data2VecAudioEncoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = Data2VecAudioAttention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Data2VecAudioFeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Data2VecAudioEncoderLayerStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = Data2VecAudioAttention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Data2VecAudioFeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Data2VecAudioEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = 
Data2VecAudioPositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([Data2VecAudioEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + hidden_states[~attention_mask] = 0.0 + + # extend attention_mask + attention_mask = (1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)) * -10000.0 + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Data2VecAudioAdapter(nn.Module): + def __init__(self, config): + super().__init__() + + # feature dim might need to be down-projected + if config.output_hidden_size != config.hidden_size: + self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) + else: + self.proj = self.proj_layer_norm = None + + self.layers = nn.ModuleList(Data2VecAudioAdapterLayer(config) for _ in range(config.num_adapter_layers)) + self.layerdrop = config.layerdrop + + def forward(self, hidden_states): + # down project hidden_states if necessary + if self.proj is not None and self.proj_layer_norm is not None: + hidden_states = self.proj(hidden_states) + hidden_states = self.proj_layer_norm(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + + for layer in self.layers: + layerdrop_prob = 
np.random.random() + if not self.training or (layerdrop_prob > self.layerdrop): + hidden_states = layer(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Data2VecAudioAdapterLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.output_hidden_size, + 2 * config.output_hidden_size, + config.adapter_kernel_size, + stride=config.adapter_stride, + padding=1, + ) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = nn.functional.glu(hidden_states, dim=1) + + return hidden_states + + +class Data2VecAudioPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Data2VecAudioConfig + base_model_prefix = "data2vec_audio" + main_input_name = "input_values" + _keys_to_ignore_on_load_missing = [r"position_ids"] + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, Data2VecAudioFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + if module.bias is not None: + module.bias.data.zero_() + if module.weight is not None: + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + def _get_feat_extract_output_lengths( + self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch_int_div(input_length - kernel_size, stride) + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + def _get_feature_vector_attention_mask( + self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None + ): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. 
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] + + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) + output_lengths = output_lengths.to(torch.long) + + batch_size = attention_mask.shape[0] + + attention_mask = torch.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + return attention_mask + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Data2VecAudioEncoder, Data2VecAudioFeatureEncoder)): + module.gradient_checkpointing = value + + +DATA_2_VEC_AUDIO_START_DOCSTRING = r""" + Data2VecAudio was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech + Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael + Auli. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving etc.). + + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`Data2VecAudioConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +DATA_2_VEC_AUDIO_INPUTS_DOCSTRING = r""" + Args: + input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file + into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install + soundfile*). To prepare the array into *input_values*, the [`Wav2Vec2Processor`] should be used for padding + and conversion into a tensor of type *torch.FloatTensor*. See [`Wav2Vec2Processor.__call__`] for details. + attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, + 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + + + `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == + True`. For all models whose processor has `config.return_attention_mask == False`, such as + [data2vec-audio-base](https://huggingface.co/facebook/data2vec-audio-base-960h), `attention_mask` should **not** be + passed to avoid degraded performance when doing batched inference. For such models `input_values` should + simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly + different results depending on whether `input_values` is padded or not. + + + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Data2VecAudio Model transformer outputting raw hidden-states without any specific head on top.", + DATA_2_VEC_AUDIO_START_DOCSTRING, +) +class Data2VecAudioModel(Data2VecAudioPreTrainedModel): + def __init__(self, config: Data2VecAudioConfig): + super().__init__(config) + self.config = config + self.feature_extractor = Data2VecAudioFeatureEncoder(config) + self.feature_projection = Data2VecAudioFeatureProjection(config) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) + + self.encoder = Data2VecAudioEncoder(config) + + self.adapter = Data2VecAudioAdapter(config) if config.add_adapter else None + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.feature_extractor._freeze_parameters() + + def _mask_hidden_states( + self, + hidden_states: torch.FloatTensor, + mask_time_indices: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://arxiv.org/abs/1904.08779). 
+ """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.size() + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_PROCESSOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Data2VecAudioBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + input_values, + attention_mask=None, + mask_time_indices=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Data2VecAudioBaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + 
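For orientation, here is a minimal usage sketch of the bare `Data2VecAudioModel` defined above. It is not part of the patch itself: it assumes `Data2VecAudioConfig` ships with wav2vec2-style defaults and that both classes end up exported from the top-level `transformers` namespace, and it simply runs a randomly initialized model on a dummy 16 kHz waveform to show the expected input and output shapes.

```python
import torch

from transformers import Data2VecAudioConfig, Data2VecAudioModel  # assumed top-level exports

# assumption: the default config describes a wav2vec2-style conv feature encoder
config = Data2VecAudioConfig()
model = Data2VecAudioModel(config).eval()

# one second of fake 16 kHz audio, shape (batch_size, sequence_length)
waveform = torch.randn(1, 16000)

with torch.no_grad():
    outputs = model(input_values=waveform)

# the conv feature encoder downsamples the waveform; the transformer keeps that frame count
num_frames = model._get_feat_extract_output_lengths(waveform.shape[-1])
print(outputs.last_hidden_state.shape)  # (1, num_frames, config.hidden_size)
print(num_frames)
```

With a wav2vec2-style conv stack (total stride 320), one second of audio maps to roughly 49 encoder frames, which is what `_get_feat_extract_output_lengths` computes layer by layer via `(input_length - kernel_size) // stride + 1`.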
+@add_start_docstrings("""Data2VecAudio Model with a quantizer and `VQ` head on top.""", DATA_2_VEC_AUDIO_START_DOCSTRING)
+class Data2VecAudioForPreTraining(Data2VecAudioPreTrainedModel):
+    def __init__(self, config: Data2VecAudioConfig):
+        super().__init__(config)
+        self.data2vec_audio = Data2VecAudioModel(config)
+        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # make sure that project_hid & project_q are initialized like normal linear layers
+        self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim)
+        self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
+
+    def set_gumbel_temperature(self, temperature: int):
+        """
+        Set the Gumbel softmax temperature to a given value. Only necessary for training
+        """
+        self.quantizer.temperature = temperature
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.data2vec_audio.feature_extractor._freeze_parameters()
+
+    @staticmethod
+    def compute_contrastive_logits(
+        target_features: torch.FloatTensor,
+        negative_features: torch.FloatTensor,
+        predicted_features: torch.FloatTensor,
+        temperature: int = 0.1,
+    ):
+        """
+        Compute logits for contrastive loss using cosine similarity as the distance measure between
+        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
+        """
+        target_features = torch.cat([target_features, negative_features], dim=0)
+
+        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(
+            target_features
+        )
+
+        # apply temperature
+        logits = logits / temperature
+        return logits
+
+    @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Data2VecAudioForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_values,
+        attention_mask=None,
+        mask_time_indices=None,
+        sampled_negative_indices=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*):
+            Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss.
+            Required input for pre-training.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import Wav2Vec2FeatureExtractor, Data2VecAudioForPreTraining
+        >>> from transformers.models.data2vec.modeling_data2vec_audio import _compute_mask_indices
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
+
+        >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("patrickvonplaten/data2vec-audio-base")
+        >>> model = Data2VecAudioForPreTraining.from_pretrained("patrickvonplaten/data2vec-audio-base")
+
+
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     
return batch + + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = ds.map(map_to_array) + + >>> input_values = feature_extractor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length) + >>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2) + >>> mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long) + + >>> with torch.no_grad(): + ... outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + >>> # show that cosine similarity is much higher than random + >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5 + tensor(True) + + >>> # for contrastive loss training model should be put into train mode + >>> model = model.train() + >>> loss = model(input_values, mask_time_indices=mask_time_indices).loss + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if mask_time_indices is not None: + mask_time_indices = mask_time_indices.to(torch.bool) + + outputs = self.data2vec_audio( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + mask_time_indices=mask_time_indices, + return_dict=return_dict, + ) + + # 1. project all transformed features (including masked) to final vq dim + transformer_features = self.project_hid(outputs[0]) + + # 2. quantize all (unmasked) extracted features and project to final vq dim + extract_features = self.dropout_features(outputs[1]) + + if attention_mask is not None: + # compute reduced attention_mask correponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + quantized_features, codevector_perplexity = self.quantizer( + extract_features, mask_time_indices=mask_time_indices + ) + quantized_features = self.project_q(quantized_features) + + loss = contrastive_loss = diversity_loss = None + if sampled_negative_indices is not None: + batch_size, sequence_length, hidden_size = quantized_features.shape + + # for training, we sample negatives + # 3. sample K negatives (distractors) quantized states for contrastive loss + # if attention_mask is passed, make sure that padded feature vectors cannot be sampled + # sample negative quantized vectors BTC => (BxT)C + negative_quantized_features = quantized_features.view(-1, hidden_size)[ + sampled_negative_indices.long().view(-1) + ] + negative_quantized_features = negative_quantized_features.view( + batch_size, sequence_length, -1, hidden_size + ).permute(2, 0, 1, 3) + + # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa` + # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf + logits = self.compute_contrastive_logits( + quantized_features[None, :], + negative_quantized_features, + transformer_features, + self.config.contrastive_logits_temperature, + ) + + # 5. if a negative vector is identical to the positive (i.e. 
when codebook utilization is low), + # its cosine similarity will be masked + neg_is_pos = (quantized_features == negative_quantized_features).all(-1) + + if neg_is_pos.any(): + logits[1:][neg_is_pos] = float("-inf") + + # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = + # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) + logits = logits.transpose(0, 2).reshape(-1, logits.size(0)) + target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten() + + contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") + # 7. compute diversity loss: \mathbf{L}_d + num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups + diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() + + # 8. \mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d + loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss + + if not return_dict: + if loss is not None: + return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + + return Data2VecAudioForPreTrainingOutput( + loss=loss, + projected_states=transformer_features, + projected_quantized_states=quantized_features, + codevector_perplexity=codevector_perplexity, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + contrastive_loss=contrastive_loss, + diversity_loss=diversity_loss, + ) + + +@add_start_docstrings( + """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", + DATA_2_VEC_AUDIO_START_DOCSTRING, +) +class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.data2vec_audio = Data2VecAudioModel(config) + self.dropout = nn.Dropout(config.final_dropout) + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." + ) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.data2vec_audio.feature_extractor._freeze_parameters() + + @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_PROCESSOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_CTC_EXPECTED_OUTPUT, + expected_loss=_CTC_EXPECTED_LOSS, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. 
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.data2vec_audio( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + + if labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + flattened_targets = labels.masked_select(labels_mask) + + # ctc_loss doesn't support fp16 + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + + with torch.backends.cudnn.flags(enabled=False): + loss = nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@add_start_docstrings( + """ + Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like + SUPERB Keyword Spotting. + """, + DATA_2_VEC_AUDIO_START_DOCSTRING, +) +class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.data2vec_audio = Data2VecAudioModel(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) + self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.data2vec_audio.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. 
+ """ + for param in self.data2vec_audio.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_SEQ_CLASS_CHECKPOINT, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.data2vec_audio( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + if attention_mask is None: + pooled_output = hidden_states.mean(dim=1) + else: + padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) + hidden_states[~padding_mask] = 0.0 + pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization. + """, + DATA_2_VEC_AUDIO_START_DOCSTRING, +) +class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.data2vec_audio = Data2VecAudioModel(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
+ """ + self.data2vec_audio.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.data2vec_audio.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_FRAME_CLASS_CHECKPOINT, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_FRAME_EXPECTED_OUTPUT, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.data2vec_audio( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + logits = self.classifier(hidden_states) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return output + + return TokenClassifierOutput( + loss=None, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AMSoftmaxLoss(nn.Module): + def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): + super(AMSoftmaxLoss, self).__init__() + self.scale = scale + self.margin = margin + self.num_labels = num_labels + self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() + + def forward(self, hidden_states, labels): + labels = labels.flatten() + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) + cos_theta = torch.mm(hidden_states, weight) + psi = cos_theta - self.margin + + onehot = nn.functional.one_hot(labels, self.num_labels) + logits = self.scale * torch.where(onehot.bool(), psi, cos_theta) + loss = self.loss(logits, labels) + + return loss + + +class TDNNLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id] + self.out_conv_dim = config.tdnn_dim[layer_id] + self.kernel_size = config.tdnn_kernel[layer_id] + self.dilation = config.tdnn_dilation[layer_id] + + self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim) + self.activation = nn.ReLU() + + def forward(self, hidden_states): + hidden_states = hidden_states.unsqueeze(1) + 
hidden_states = nn.functional.unfold( + hidden_states, + (self.kernel_size, self.in_conv_dim), + stride=(1, self.in_conv_dim), + dilation=(self.dilation, 1), + ) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.kernel(hidden_states) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +@add_start_docstrings( + """ + Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification. + """, + DATA_2_VEC_AUDIO_START_DOCSTRING, +) +class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.data2vec_audio = Data2VecAudioModel(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) + + tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] + self.tdnn = nn.ModuleList(tdnn_layers) + + self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim) + self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim) + + self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels) + + self.init_weights() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.data2vec_audio.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.data2vec_audio.parameters(): + param.requires_grad = False + + def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]): + """ + Computes the output length of the TDNN layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return (input_length - kernel_size) // stride + 1 + + for kernel_size in self.config.tdnn_kernel: + input_lengths = _conv_out_length(input_lengths, kernel_size, 1) + + return input_lengths + + @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_XVECTOR_CHECKPOINT, + output_type=XVectorOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_XVECTOR_EXPECTED_OUTPUT, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.data2vec_audio( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + + for tdnn_layer in self.tdnn: + hidden_states = tdnn_layer(hidden_states) + + # Statistic Pooling + if attention_mask is None: + mean_features = hidden_states.mean(dim=1) + std_features = hidden_states.std(dim=1) + else: + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) + tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) + mean_features = [] + std_features = [] + for i, length in enumerate(tdnn_output_lengths): + mean_features.append(hidden_states[i, :length].mean(dim=0)) + std_features.append(hidden_states[i, :length].std(dim=0)) + mean_features = torch.stack(mean_features) + std_features = torch.stack(std_features) + statistic_pooling = torch.cat([mean_features, std_features], dim=-1) + + output_embeddings = self.feature_extractor(statistic_pooling) + logits = self.classifier(output_embeddings) + + loss = None + if labels is not None: + loss = self.objective(logits, labels) + + if not return_dict: + output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return XVectorOutput( + loss=loss, + logits=logits, + embeddings=output_embeddings, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec_text.py similarity index 75% rename from src/transformers/models/data2vec/modeling_data2vec.py rename to src/transformers/models/data2vec/modeling_data2vec_text.py index a3420d84b195a..75ba37c2a4152 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -12,18 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch Data2Vec model.""" +"""PyTorch Data2VecText model.""" import math -from typing import Union, Optional import torch import torch.utils.checkpoint from packaging import version from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -import numpy as np from ...activations import ACT2FN, gelu from ...file_utils import ( add_code_sample_docstrings, @@ -31,7 +29,6 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) -from ...deepspeed import is_deepspeed_zero3_enabled from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, @@ -41,7 +38,6 @@ QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, - CausalLMOutput, ) from ...modeling_utils import ( PreTrainedModel, @@ -49,16 +45,8 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) -from ...modeling_utils import torch_int_div -from ...modeling_outputs import BaseModelOutput -from ...models.wav2vec2.modeling_wav2vec2 import ( - Wav2Vec2Adapter, - Wav2Vec2BaseModelOutput, - Wav2Vec2FeatureEncoder, - Wav2Vec2EncoderLayer, -) from ...utils import logging -from .configuration_data2vec import Data2VecConfig +from .configuration_data2vec import Data2VecTextConfig logger = logging.get_logger(__name__) @@ -67,7 +55,7 @@ _HIDDEN_STATES_START_POSITION = 2 _CHECKPOINT_FOR_DOC = "data2vec" -_CONFIG_FOR_DOC = "Data2VecConfig" +_CONFIG_FOR_DOC = "Data2VecTextConfig" _TOKENIZER_FOR_DOC = "RobertaTokenizer" # General docstring @@ -78,14 +66,14 @@ _CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" _CTC_EXPECTED_LOSS = 53.48 -DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = [ +DATA2VECTEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "data2vec", # See all data2vec models at https://huggingface.co/models?filter=data2vec ] -# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2Vec -class Data2VecForTextEmbeddings(nn.Module): +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2VecText +class Data2VecTextForTextEmbeddings(nn.Module): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ @@ -175,7 +163,7 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): return position_ids.unsqueeze(0).expand(input_shape) -class Data2VecSelfAttention(nn.Module): +class Data2VecTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -275,7 +263,7 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in Data2VecForTextModel forward() function) + # Apply the attention mask is (precomputed for all layers in Data2VecTextModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. 
@@ -303,7 +291,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertSelfOutput -class Data2VecSelfOutput(nn.Module): +class Data2VecTextSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -317,12 +305,12 @@ def forward(self, hidden_states, input_tensor): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2Vec -class Data2VecAttention(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2VecText +class Data2VecTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = Data2VecSelfAttention(config, position_embedding_type=position_embedding_type) - self.output = Data2VecSelfOutput(config) + self.self = Data2VecTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = Data2VecTextSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): @@ -368,7 +356,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertIntermediate -class Data2VecIntermediate(nn.Module): +class Data2VecTextIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) @@ -384,7 +372,7 @@ def forward(self, hidden_states): # Copied from transformers.models.bert.modeling_bert.BertOutput -class Data2VecOutput(nn.Module): +class Data2VecTextOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -398,21 +386,21 @@ def forward(self, hidden_states, input_tensor): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Data2Vec -class Data2VecLayer(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Data2VecText +class Data2VecTextLayer(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = Data2VecAttention(config) + self.attention = Data2VecTextAttention(config) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: if not self.is_decoder: raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = Data2VecAttention(config, position_embedding_type="absolute") - self.intermediate = Data2VecIntermediate(config) - self.output = Data2VecOutput(config) + self.crossattention = Data2VecTextAttention(config, position_embedding_type="absolute") + self.intermediate = Data2VecTextIntermediate(config) + self.output = Data2VecTextOutput(config) def forward( self, @@ -484,12 +472,12 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Data2Vec -class Data2VecEncoder(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Data2VecText +class Data2VecTextEncoder(nn.Module): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.ModuleList([Data2VecLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([Data2VecTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -583,7 +571,7 @@ def custom_forward(*inputs): # Copied from 
transformers.models.bert.modeling_bert.BertPooler -class Data2VecPooler(nn.Module): +class Data2VecTextPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -598,13 +586,14 @@ def forward(self, hidden_states): return pooled_output -class Data2VecPreTrainedModel(PreTrainedModel): +# Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->Data2VecText, roberta->data2vec-text +class Data2VecTextPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = Data2VecConfig + config_class = Data2VecTextConfig base_model_prefix = "data2vec" supports_gradient_checkpointing = True @@ -627,7 +616,7 @@ def _init_weights(self, module): module.weight.data.fill_(1.0) def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, Data2VecEncoder): + if isinstance(module, Data2VecTextEncoder): module.gradient_checkpointing = value def update_keys_to_ignore(self, config, del_keys_to_ignore): @@ -640,7 +629,7 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): ] -DATA2VEC_START_DOCSTRING = r""" +DATA2VECTEXT_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads @@ -651,12 +640,12 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): and behavior. Parameters: - config ([`Data2VecConfig`]): Model configuration class with all the parameters of the + config ([`Data2VecTextConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -DATA2VEC_INPUTS_DOCSTRING = r""" +DATA2VECTEXT_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the vocabulary. 
@@ -706,393 +695,11 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): """ -class Data2VecPadLayer(nn.Module): - def __init__(self, conv_pos_kernel_size): - super().__init__() - self.num_pad_remove = 1 if conv_pos_kernel_size % 2 == 0 else 0 - - def forward(self, hidden_states): - if self.num_pad_remove > 0: - hidden_states = hidden_states[:, :, : -self.num_pad_remove] - return hidden_states - - -class Data2VecPositionalConvLayer(nn.Module): - def __init__(self, config): - super().__init__() - self.conv = nn.Conv1d( - config.hidden_size, - config.hidden_size, - kernel_size=config.conv_pos_kernel_size, - padding=config.conv_pos_kernel_size // 2, - groups=config.num_conv_pos_embedding_groups, - ) - - self.padding = Data2VecPadLayer(config.conv_pos_kernel_size) - self.activation = ACT2FN[config.feat_extract_activation] - # no learnable parameters - self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False) - - def forward(self, hidden_states): - hidden_states = self.conv(hidden_states) - hidden_states = self.padding(hidden_states) - - hidden_states = hidden_states.transpose(1, 2) - hidden_states = self.layer_norm(hidden_states) - hidden_states = hidden_states.transpose(1, 2) - hidden_states = self.activation(hidden_states) - return hidden_states - - -class Data2VecPositionalConvEmbedding(nn.Module): - def __init__(self, config): - super().__init__() - self.layers = nn.ModuleList([Data2VecPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]) - - def forward(self, hidden_states): - hidden_states = hidden_states.transpose(1, 2) - for layer in self.layers: - hidden_states = layer(hidden_states) - hidden_states = hidden_states.transpose(1, 2) - return hidden_states - - -class Data2VecFeatureProjection(nn.Module): - def __init__(self, config): - super().__init__() - self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) - self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) - self.dropout = nn.Dropout(config.feat_proj_dropout) - - def forward(self, hidden_states): - # non-projected hidden states are needed for quantization - norm_hidden_states = self.layer_norm(hidden_states) - hidden_states = self.projection(norm_hidden_states) - hidden_states = self.dropout(hidden_states) - return hidden_states, norm_hidden_states - - -class Data2VecAudioEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.pos_conv_embed = Data2VecPositionalConvEmbedding(config) - self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout) - self.layers = nn.ModuleList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - - if attention_mask is not None: - # make sure padded tokens output 0 - hidden_states[~attention_mask] = 0.0 - - # extend attention_mask - attention_mask = (1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)) * -10000.0 - attention_mask = attention_mask.expand( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] - ) - - position_embeddings = self.pos_conv_embed(hidden_states) - hidden_states = hidden_states + position_embeddings - 
hidden_states = self.layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states) - - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() - - for layer in self.layers: - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) - - skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync - if self.gradient_checkpointing and self.training: - # create gradient checkpointing function - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer), - hidden_states, - attention_mask, - ) - else: - layer_outputs = layer( - hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) - hidden_states = layer_outputs[0] - - if skip_the_layer: - layer_outputs = (None, None) - - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -@add_start_docstrings( - "The bare Data2Vec Model for audio transformer outputting raw hidden-states without any specific head on top.", - DATA2VEC_START_DOCSTRING, -) -class Data2VecForAudioModel(Data2VecPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.config = config - self.feature_extractor = Wav2Vec2FeatureEncoder(config) - self.feature_projection = Data2VecFeatureProjection(config) - self.dropout = nn.Dropout(config.hidden_dropout) - - # model only needs masking vector if mask prob is > 0.0 - if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: - self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) - - self.encoder = Data2VecAudioEncoder(config) - - self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None - - # Initialize weights and apply final processing - self.post_init() - - def _get_feat_extract_output_lengths( - self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None - ): - """ - Computes the output length of the convolutional layers - """ - - add_adapter = self.config.add_adapter if add_adapter is None else add_adapter - - def _conv_out_length(input_length, kernel_size, stride): - # 1D convolutional layer output length formula taken - # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html - return torch_int_div(input_length - kernel_size, stride) + 1 - - for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): - input_lengths = _conv_out_length(input_lengths, kernel_size, stride) - - if add_adapter: - for _ in range(self.config.num_adapter_layers): - input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) - - return input_lengths - - def _get_feature_vector_attention_mask( - self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None - ): - # Effectively attention_mask.sum(-1), but 
not inplace to be able to run - # on inference mode. - non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] - - output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) - output_lengths = output_lengths.to(torch.long) - - batch_size = attention_mask.shape[0] - - attention_mask = torch.zeros( - (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device - ) - # these two operations makes sure that all values before the output lengths idxs are attended to - attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 - attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() - return attention_mask - - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPoolingAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_values, - attention_mask=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - extract_features = self.feature_extractor(input_values) - extract_features = extract_features.transpose(1, 2) - - if attention_mask is not None: - # compute reduced attention_mask corresponding to feature vectors - attention_mask = self._get_feature_vector_attention_mask( - extract_features.shape[1], attention_mask, add_adapter=False - ) - - hidden_states, extract_features = self.feature_projection(extract_features) - - encoder_outputs = self.encoder( - hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = encoder_outputs[0] - - if not return_dict: - return (hidden_states, extract_features) + encoder_outputs[1:] - - return Wav2Vec2BaseModelOutput( - last_hidden_state=hidden_states, - extract_features=extract_features, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -@add_start_docstrings( - """Data2Vec Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", - DATA2VEC_START_DOCSTRING, -) -class Data2VecForCTC(Data2VecPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.data2vec = Data2VecForAudioModel(config) - self.dropout = nn.Dropout(config.final_dropout) - - if config.vocab_size is None: - raise ValueError( - f"You are trying to instantiate {self.__class__} with a configuration that " - "does not define the vocabulary size of the language model head. Please " - "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. " - "or define `vocab_size` of your model's configuration." - ) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - - # Initialize weights and apply final processing - self.post_init() - - def freeze_feature_encoder(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. 
- """ - self.wav2vec2.feature_extractor._freeze_parameters() - - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_PROCESSOR_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=CausalLMOutput, - config_class=_CONFIG_FOR_DOC, - expected_output=_CTC_EXPECTED_OUTPUT, - expected_loss=_CTC_EXPECTED_LOSS, - ) - def forward( - self, - input_values, - attention_mask=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): - Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to - the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. - All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., - config.vocab_size - 1]`. - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.data2vec( - input_values, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - hidden_states = self.dropout(hidden_states) - - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - - if labels.max() >= self.config.vocab_size: - raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") - - # retrieve loss input_lengths from attention_mask - attention_mask = ( - attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) - ) - input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) - - # assuming that padded tokens are filled with -100 - # when not being attended to - labels_mask = labels >= 0 - target_lengths = labels_mask.sum(-1) - flattened_targets = labels.masked_select(labels_mask) - - # ctc_loss doesn't support fp16 - log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) - - with torch.backends.cudnn.flags(enabled=False): - loss = nn.functional.ctc_loss( - log_probs, - flattened_targets, - input_lengths, - target_lengths, - blank=self.config.pad_token_id, - reduction=self.config.ctc_loss_reduction, - zero_infinity=self.config.ctc_zero_infinity, - ) - - if not return_dict: - output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutput( - loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions - ) - - @add_start_docstrings( - "The bare Data2Vec Model for text transformer outputting raw hidden-states without any specific head on top.", - DATA2VEC_START_DOCSTRING, + "The bare Data2VecText Model for text transformer outputting raw hidden-states without any specific head on top.", + DATA2VECTEXT_START_DOCSTRING, ) -class Data2VecForTextModel(Data2VecPreTrainedModel): +class Data2VecTextModel(Data2VecTextPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -1114,10 +721,10 @@ def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config - self.embeddings = Data2VecForTextEmbeddings(config) - self.encoder = Data2VecEncoder(config) + self.embeddings = Data2VecTextForTextEmbeddings(config) + self.encoder = 
Data2VecTextEncoder(config) - self.pooler = Data2VecPooler(config) if add_pooling_layer else None + self.pooler = Data2VecTextPooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() @@ -1136,7 +743,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1275,9 +882,9 @@ def forward( @add_start_docstrings( - """Data2Vec Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VEC_START_DOCSTRING + """Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VECTEXT_START_DOCSTRING ) -class Data2VecForCausalLM(Data2VecPreTrainedModel): +class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel): _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_unexpected = [r"pooler"] @@ -1286,10 +893,10 @@ def __init__(self, config): super().__init__(config) if not config.is_decoder: - logger.warning("If you want to use `Data2VecLMHeadModel` as a standalone, add `is_decoder=True.`") + logger.warning("If you want to use `Data2VecTextLMHeadModel` as a standalone, add `is_decoder=True.`") - self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) - self.lm_head = Data2VecLMHead(config) + self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.lm_head = Data2VecTextLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) @@ -1303,7 +910,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1352,13 +959,13 @@ def forward( Example: ```python - >>> from transformers import Data2VecTokenizer, Data2VecForCausalLM, Data2VecConfig + >>> from transformers import Data2VecTextTokenizer, Data2VecTextForCausalLM, Data2VecTextConfig >>> import torch - >>> tokenizer = Data2VecTokenizer.from_pretrained("data2vec-base") - >>> config = Data2VecConfig.from_pretrained("data2vec-base") + >>> tokenizer = Data2VecTextTokenizer.from_pretrained("data2vec-base") + >>> config = Data2VecTextConfig.from_pretrained("data2vec-base") >>> config.is_decoder = True - >>> model = Data2VecForCausalLM.from_pretrained("data2vec-base", config=config) + >>> model = Data2VecTextForCausalLM.from_pretrained("data2vec-base", config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1428,8 +1035,8 @@ def _reorder_cache(self, past, beam_idx): return reordered_past -@add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VEC_START_DOCSTRING) -class 
Data2VecForMaskedLM(Data2VecPreTrainedModel): +@add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VECTEXT_START_DOCSTRING) +class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel): _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_unexpected = [r"pooler"] @@ -1439,12 +1046,12 @@ def __init__(self, config): if config.is_decoder: logger.warning( - "If you want to use `Data2VecForMaskedLM` make sure `config.is_decoder=False` for " + "If you want to use `Data2VecTextForMaskedLM` make sure `config.is_decoder=False` for " "bi-directional self-attention." ) - self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) - self.lm_head = Data2VecLMHead(config) + self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.lm_head = Data2VecTextLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) @@ -1458,7 +1065,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1524,9 +1131,9 @@ def forward( ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Data2Vec -class Data2VecLMHead(nn.Module): - """Data2Vec Head for masked language modeling.""" +# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Data2VecText +class Data2VecTextLMHead(nn.Module): + """Data2VecText Head for masked language modeling.""" def __init__(self, config): super().__init__() @@ -1554,12 +1161,12 @@ def _tie_weights(self): @add_start_docstrings( """ - Data2Vec Model transformer with a sequence classification/regression head on top (a linear layer on top of the + Data2VecText Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - DATA2VEC_START_DOCSTRING, + DATA2VECTEXT_START_DOCSTRING, ) -class Data2VecForSequenceClassification(Data2VecPreTrainedModel): +class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config): @@ -1567,13 +1174,13 @@ def __init__(self, config): self.num_labels = config.num_labels self.config = config - self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) - self.classifier = Data2VecClassificationHead(config) + self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.classifier = Data2VecTextClassificationHead(config) # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1652,18 +1259,18 @@ def forward( @add_start_docstrings( """ - Data2Vec Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + Data2VecText Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - DATA2VEC_START_DOCSTRING, + DATA2VECTEXT_START_DOCSTRING, ) -class Data2VecForMultipleChoice(Data2VecPreTrainedModel): +class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config): super().__init__(config) - self.data2vec = Data2VecForTextModel(config) + self.data2vec = Data2VecTextModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) @@ -1671,7 +1278,7 @@ def __init__(self, config): self.post_init() @add_start_docstrings_to_model_forward( - DATA2VEC_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, @@ -1747,12 +1354,12 @@ def forward( @add_start_docstrings( """ - Data2Vec Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Data2VecText Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - DATA2VEC_START_DOCSTRING, + DATA2VECTEXT_START_DOCSTRING, ) -class Data2VecForTokenClassification(Data2VecPreTrainedModel): +class Data2VecTextForTokenClassification(Data2VecTextPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -1760,7 +1367,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) + self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1770,7 +1377,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1830,8 +1437,8 @@ def forward( ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Data2Vec -class Data2VecClassificationHead(nn.Module): +# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Data2VecText +class Data2VecTextClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): @@ -1855,12 +1462,12 @@ def forward(self, features, **kwargs): @add_start_docstrings( """ - Data2Vec Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + Data2VecText Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - DATA2VEC_START_DOCSTRING, + DATA2VECTEXT_START_DOCSTRING, ) -class Data2VecForQuestionAnswering(Data2VecPreTrainedModel): +class Data2VecTextForQuestionAnswering(Data2VecTextPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -1868,13 +1475,13 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) + self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, From 9e5ac324fbe6768e6f1ba8e8052804ebaf560b44 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sun, 20 Feb 2022 15:00:43 -0500 Subject: [PATCH 041/121] Update names and fix style, quality and repo consistency --- docs/source/model_doc/data2vec.mdx | 66 ++- src/transformers/__init__.py | 52 +- .../models/auto/configuration_auto.py | 4 +- src/transformers/models/auto/modeling_auto.py | 19 +- src/transformers/models/data2vec/__init__.py | 58 +- ...vec_audio.py => configuration_data2vec.py} | 150 ++++- .../data2vec/configuration_data2vec_text.py | 147 ----- ..._original_pytorch_checkpoint_to_pytorch.py | 30 +- ..._original_pytorch_checkpoint_to_pytorch.py | 10 +- .../data2vec/modeling_data2vec_audio.py | 63 +-- .../models/data2vec/modeling_data2vec_text.py | 13 +- src/transformers/utils/dummy_pt_objects.py | 60 +- tests/test_modeling_data2vec.py | 519 ----------------- tests/test_modeling_data2vec_audio.py | 14 + tests/test_modeling_data2vec_text.py | 534 ++++++++++++++++++ utils/check_repo.py | 6 +- utils/tests_fetcher.py | 1 + 17 files changed, 930 insertions(+), 816 deletions(-) rename src/transformers/models/data2vec/{configuration_data2vec_audio.py => configuration_data2vec.py} (71%) delete mode 100644 src/transformers/models/data2vec/configuration_data2vec_text.py create mode 100644 tests/test_modeling_data2vec_audio.py create mode 100644 tests/test_modeling_data2vec_text.py diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index 2471ccc883c32..6474ceb4758ca 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -40,46 +40,76 @@ This model was contributed by [edugp](https://huggingface.co/edugp). The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec). 
-## Data2VecConfig +## Data2VecTextConfig -[[autodoc]] Data2VecConfig +[[autodoc]] Data2VecTextConfig -## Data2VecForAudioModel +## Data2VecAudioConfig -[[autodoc]] Data2VecForAudioModel +[[autodoc]] Data2VecAudioConfig + +## Data2VecAudioModel + +[[autodoc]] Data2VecAudioModel + - forward + + +## Data2VecAudioForAudioFrameClassification + +[[autodoc]] Data2VecAudioForAudioFrameClassification + - forward + +## Data2VecAudioForCTC + +[[autodoc]] Data2VecAudioForCTC + - forward + +## Data2VecAudioForPreTraining + +[[autodoc]] Data2VecAudioForPreTraining + - forward + +## Data2VecAudioForSequenceClassification + +[[autodoc]] Data2VecAudioForSequenceClassification + - forward + +## Data2VecAudioForXVector + +[[autodoc]] Data2VecAudioForXVector - forward -## Data2VecForTextModel +## Data2VecTextModel -[[autodoc]] Data2VecForTextModel +[[autodoc]] Data2VecTextModel - forward -## Data2VecForCausalLM +## Data2VecTextForCausalLM -[[autodoc]] Data2VecForCausalLM +[[autodoc]] Data2VecTextForCausalLM - forward -## Data2VecForMaskedLM +## Data2VecTextForMaskedLM -[[autodoc]] Data2VecForMaskedLM +[[autodoc]] Data2VecTextForMaskedLM - forward -## Data2VecForSequenceClassification +## Data2VecTextForSequenceClassification -[[autodoc]] Data2VecForSequenceClassification +[[autodoc]] Data2VecTextForSequenceClassification - forward -## Data2VecForMultipleChoice +## Data2VecTextForMultipleChoice -[[autodoc]] Data2VecForMultipleChoice +[[autodoc]] Data2VecTextForMultipleChoice - forward -## Data2VecForTokenClassification +## Data2VecTextForTokenClassification -[[autodoc]] Data2VecForTokenClassification +[[autodoc]] Data2VecTextForTokenClassification - forward -## Data2VecForQuestionAnswering +## Data2VecTextForQuestionAnswering -[[autodoc]] Data2VecForQuestionAnswering +[[autodoc]] Data2VecTextForQuestionAnswering - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 66f215a23e6f9..41da5f5f663fe 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -203,7 +203,7 @@ "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.cpm": ["CpmTokenizer"], "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"], - "models.data2vec": ["DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecConfig"], + "models.data2vec": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecTextConfig"], "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"], "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], @@ -850,15 +850,21 @@ _import_structure["models.data2vec"].extend( [ "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", - "Data2VecForAudioModel", - "Data2VecForCausalLM", - "Data2VecForMaskedLM", - "Data2VecForMultipleChoice", - "Data2VecForQuestionAnswering", - "Data2VecForSequenceClassification", - "Data2VecForTextModel", - "Data2VecForTokenClassification", - "Data2VecPreTrainedModel", + "Data2VecAudioForAudioFrameClassification", + "Data2VecAudioForCTC", + "Data2VecAudioForPreTraining", + "Data2VecAudioForSequenceClassification", + "Data2VecAudioForXVector", + "Data2VecAudioModel", + "Data2VecAudioPreTrainedModel", + "Data2VecTextForCausalLM", + "Data2VecTextForMaskedLM", + "Data2VecTextForMultipleChoice", + "Data2VecTextForQuestionAnswering", + "Data2VecTextForSequenceClassification", + "Data2VecTextForTokenClassification", + "Data2VecTextModel", + 
"Data2VecTextPreTrainedModel", ] ) _import_structure["models.deberta"].extend( @@ -2408,7 +2414,7 @@ from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.cpm import CpmTokenizer from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer - from .models.data2vec import DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecConfig + from .models.data2vec import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecTextConfig from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig @@ -2946,15 +2952,21 @@ ) from .models.data2vec import ( DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecForAudioModel, - Data2VecForCausalLM, - Data2VecForMaskedLM, - Data2VecForMultipleChoice, - Data2VecForQuestionAnswering, - Data2VecForSequenceClassification, - Data2VecForTextModel, - Data2VecForTokenClassification, - Data2VecPreTrainedModel, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForPreTraining, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Data2VecAudioPreTrainedModel, + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextModel, + Data2VecTextPreTrainedModel, ) from .models.deberta import ( DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c052d4c08fc40..60036b33234f6 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -88,7 +88,7 @@ ("reformer", "ReformerConfig"), ("longformer", "LongformerConfig"), ("roberta", "RobertaConfig"), - ("data2vec", "Data2VecConfig"), + ("data2vec", "Data2VecTextConfig"), ("deberta-v2", "DebertaV2Config"), ("deberta", "DebertaConfig"), ("flaubert", "FlaubertConfig"), @@ -172,7 +172,7 @@ ("xlnet", "XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("data2vec", "DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("data2vec", "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 82b7ab2ac0a83..528a02ba7796f 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -80,7 +80,7 @@ ("bart", "BartModel"), ("longformer", "LongformerModel"), ("roberta", "RobertaModel"), - ("data2vec", "Data2VecForTextModel"), + ("data2vec", "Data2VecTextModel"), ("layoutlm", "LayoutLMModel"), ("squeezebert", "SqueezeBertModel"), ("bert", "BertModel"), @@ -131,7 +131,7 @@ ("fsmt", "FSMTForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecForMaskedLM"), + ("data2vec", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForPreTraining"), ("big_bird", "BigBirdForPreTraining"), @@ -189,7 
+189,7 @@ ("bart", "BartForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecForMaskedLM"), + ("data2vec", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForMaskedLM"), ("openai-gpt", "OpenAIGPTLMHeadModel"), @@ -250,6 +250,7 @@ ("blenderbot-small", "BlenderbotSmallForCausalLM"), ("megatron-bert", "MegatronBertForCausalLM"), ("speech_to_text_2", "Speech2Text2ForCausalLM"), + ("data2vec", "Data2VecTextForCausalLM"), ] ) @@ -316,7 +317,7 @@ ("xlm-roberta", "XLMRobertaForMaskedLM"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecForMaskedLM"), + ("data2vec", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForMaskedLM"), ("megatron-bert", "MegatronBertForMaskedLM"), @@ -395,7 +396,7 @@ ("bart", "BartForSequenceClassification"), ("longformer", "LongformerForSequenceClassification"), ("roberta", "RobertaForSequenceClassification"), - ("data2vec", "Data2VecForSequenceClassification"), + ("data2vec", "Data2VecTextForSequenceClassification"), ("squeezebert", "SqueezeBertForSequenceClassification"), ("layoutlm", "LayoutLMForSequenceClassification"), ("bert", "BertForSequenceClassification"), @@ -462,6 +463,7 @@ ("deberta-v2", "DebertaV2ForQuestionAnswering"), ("ibert", "IBertForQuestionAnswering"), ("splinter", "SplinterForQuestionAnswering"), + ("data2vec", "Data2VecTextForQuestionAnswering"), ] ) @@ -508,6 +510,7 @@ ("deberta-v2", "DebertaV2ForTokenClassification"), ("gpt2", "GPT2ForTokenClassification"), ("ibert", "IBertForTokenClassification"), + ("data2vec", "Data2VecTextForTokenClassification"), ] ) @@ -529,7 +532,7 @@ ("xlm-roberta", "XLMRobertaForMultipleChoice"), ("longformer", "LongformerForMultipleChoice"), ("roberta", "RobertaForMultipleChoice"), - ("data2vec", "Data2VecForMultipleChoice"), + ("data2vec", "Data2VecTextForMultipleChoice"), ("squeezebert", "SqueezeBertForMultipleChoice"), ("bert", "BertForMultipleChoice"), ("distilbert", "DistilBertForMultipleChoice"), @@ -565,6 +568,7 @@ ("sew", "SEWForSequenceClassification"), ("sew-d", "SEWDForSequenceClassification"), ("wavlm", "WavLMForSequenceClassification"), + ("data2vec", "Data2VecAudioForSequenceClassification"), ] ) @@ -578,6 +582,7 @@ ("sew", "SEWForCTC"), ("sew-d", "SEWDForCTC"), ("wavlm", "WavLMForCTC"), + ("data2vec", "Data2VecAudioForCTC"), ] ) @@ -587,6 +592,7 @@ ("wav2vec2", "Wav2Vec2ForAudioFrameClassification"), ("unispeech-sat", "UniSpeechSatForAudioFrameClassification"), ("wavlm", "WavLMForAudioFrameClassification"), + ("data2vec", "Data2VecAudioForAudioFrameClassification"), ] ) @@ -596,6 +602,7 @@ ("wav2vec2", "Wav2Vec2ForXVector"), ("unispeech-sat", "UniSpeechSatForXVector"), ("wavlm", "WavLMForXVector"), + ("data2vec", "Data2VecAudioForXVector"), ] ) diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 1abf2a9f9efac..8abc7d3def191 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -22,53 +22,71 @@ _import_structure = { - "configuration_data2vec_text": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecTextConfig", "Data2VecTextOnnxConfig"], - "configuration_data2vec_audio": ["DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig", "Data2VecAudioOnnxConfig"], + "configuration_data2vec": [ + "DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP", + 
"DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", + "Data2VecAudioConfig", + "Data2VecAudioOnnxConfig", + "Data2VecTextConfig", + "Data2VecTextOnnxConfig", + ], } if is_torch_available(): + _import_structure["modeling_data2vec_audio"] = [ + "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecAudioForAudioFrameClassification", + "Data2VecAudioForCTC", + "Data2VecAudioForPreTraining", + "Data2VecAudioForSequenceClassification", + "Data2VecAudioForXVector", + "Data2VecAudioModel", + "Data2VecAudioPreTrainedModel", + ] _import_structure["modeling_data2vec_text"] = [ "DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", - "Data2VecTextModel", "Data2VecTextForCausalLM", "Data2VecTextForMaskedLM", "Data2VecTextForMultipleChoice", "Data2VecTextForQuestionAnswering", "Data2VecTextForSequenceClassification", - "Data2VecTextForTextModel", "Data2VecTextForTokenClassification", + "Data2VecTextModel", "Data2VecTextPreTrainedModel", ] - _import_structure["modeling_data2vec_audio"] = [ - "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST", - "Data2VecAudioModel", - "Data2VecAudioPreTrainedModel", - "Data2VecAudioForCTC", - ] if TYPE_CHECKING: - from .configuration_data2vec_text import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecTextConfig, Data2VecTextOnnxConfig - from .configuration_data2vec_audio import DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig, Data2VecAudioOnnxConfig + from .configuration_data2vec import ( + DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, + DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, + Data2VecAudioConfig, + Data2VecAudioOnnxConfig, + Data2VecTextConfig, + Data2VecTextOnnxConfig, + ) if is_torch_available(): + from .modeling_data2vec_audio import ( + DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForPreTraining, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Data2VecAudioPreTrainedModel, + ) from .modeling_data2vec_text import ( DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecTextForAudioModel, Data2VecTextForCausalLM, Data2VecTextForMaskedLM, Data2VecTextForMultipleChoice, Data2VecTextForQuestionAnswering, Data2VecTextForSequenceClassification, - Data2VecTextForTextModel, Data2VecTextForTokenClassification, + Data2VecTextModel, Data2VecTextPreTrainedModel, ) - from .modeling_data2vec_audio import ( - DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecAudioModel, - Data2VecAudioPreTrainedModel, - Data2VecAudioForCTC, - ) else: import sys diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec.py similarity index 71% rename from src/transformers/models/data2vec/configuration_data2vec_audio.py rename to src/transformers/models/data2vec/configuration_data2vec.py index b2d1856a63b5e..891a7c6293ac0 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,25 +12,151 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Data2VecAudio model configuration""" +""" Data2VecText configuration""" +from collections import OrderedDict +from typing import Mapping from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig from ...utils import logging logger = logging.get_logger(__name__) -DATA_2_VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = { +DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json", +} + +DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = { "facebook/data2vec-base-960h": "https://huggingface.co/facebook/data2vec-audio-base-960h/resolve/main/config.json", # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio } +class Data2VecTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Data2VecTextModel`] and [`Data2VecTextModel`]. It + is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecText + [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. 
For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from transformers import Data2VecTextModel, Data2VecTextConfig + + >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration + >>> configuration = Data2VecTextConfig() + + >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration + >>> model = Data2VecTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "data2vec_text" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class Data2VecTextOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + class Data2VecAudioConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate an - Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the Data2VecAudio + This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate + an Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Data2VecAudio [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the @@ -39,10 +165,10 @@ class Data2VecAudioConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 32): - Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`]. Vocabulary size of the - model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward - method of [`Data2VecAudioModel`]. + Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`]. Vocabulary size + of the model. Defines the different tokens that can be represented by the *inputs_ids* passed to the + forward method of [`Data2VecAudioModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -156,8 +282,8 @@ class Data2VecAudioConfig(PretrainedConfig): xvector_output_dim (`int`, *optional*, defaults to 512): Dimensionality of the *XVector* embedding vectors. add_adapter (`bool`, *optional*, defaults to `False`): - Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful for - warm-starting Data2VecAudio for SpeechEncoderDecoder models. + Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful + for warm-starting Data2VecAudio for SpeechEncoderDecoder models. adapter_kernel_size (`int`, *optional*, defaults to 3): Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. adapter_stride (`int`, *optional*, defaults to 2): diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py deleted file mode 100644 index a6843c650cf1c..0000000000000 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ /dev/null @@ -1,147 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Data2VecText configuration""" -from collections import OrderedDict -from typing import Mapping - -from ...configuration_utils import PretrainedConfig -from ...onnx import OnnxConfig -from ...utils import logging - - -logger = logging.get_logger(__name__) - -DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json", -} - - -class Data2VecTextConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Data2VecTextForTextModel`] and - [`Data2VecTextForAudioModel`]. It is used to instantiate a Data2VecText model according to the specified arguments, - defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration - to that of the Data2VecText [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 30522): - Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"silu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): - The dropout ratio for the attention probabilities. - max_position_embeddings (`int`, *optional*, defaults to 512): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. 
- - Examples: - - ```python - >>> from transformers import Data2VecTextModel, Data2VecTextConfig - - >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration - >>> configuration = Data2VecTextConfig() - - >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration - >>> model = Data2VecTextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "data2vec_text" - - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - position_embedding_type="absolute", - use_cache=True, - classifier_dropout=None, - **kwargs - ): - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout - - -class Data2VecTextOnnxConfig(OnnxConfig): - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("input_ids", {0: "batch", 1: "sequence"}), - ("attention_mask", {0: "batch", 1: "sequence"}), - ] - ) diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py index 9445afe54049c..4ffdb27c082d8 100644 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -17,19 +17,19 @@ import argparse import os - -import torch from functools import reduce -from transformers import Wav2Vec2Processor, logging import fairseq +import torch from datasets import load_dataset +from transformers import Wav2Vec2Processor, logging +from transformers.models.data2vec.configuration_data2vec import Data2VecAudioConfig + # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy # noqa: F401 +from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioForCTC, Data2VecAudioModel -from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioModel, Data2VecAudioForCTC -from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -151,9 +151,7 @@ def set_weights(full_name, module, fsq_value, hf_weight_path): hf_value = hf_weight.data if fsq_value.shape != hf_value.shape: - raise ValueError( - f"{full_name} has size 
{fsq_value.shape}, but {hf_value.shape} was found." - ) + raise ValueError(f"{full_name} has size {fsq_value.shape}, but {hf_value.shape} was found.") hf_weight.data = fsq_value logger.info(f"{full_name} was correctly initialized from {hf_weight_path}.") @@ -235,15 +233,15 @@ def load_data2vec(path): input_values = inputs.input_values attention_mask = inputs.attention_mask -# input_values = inputs.input_values[:, :-1] -# attention_mask = inputs.attention_mask[:, :-1] + # input_values = inputs.input_values[:, :-1] + # attention_mask = inputs.attention_mask[:, :-1] hf_wav2vec.eval() model.eval() if is_finetuned: - their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)["encoder_out"].transpose( - 0, 1 - ) + their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)[ + "encoder_out" + ].transpose(0, 1) our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["logits"] pred_ids = torch.argmax(our_output, dim=-1) @@ -251,9 +249,9 @@ def load_data2vec(path): print(f"Expected Output: {ds[:4]['text']}, Pred: {output_string}") else: - their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)["layer_results"][-1][0].transpose( - 0, 1 - ) + their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)[ + "layer_results" + ][-1][0].transpose(0, 1) our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["last_hidden_state"] print(our_output.shape, their_output.shape) diff --git a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py index 4173bc3085c5e..5dff263d7ab55 100644 --- a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py @@ -24,7 +24,7 @@ from fairseq.modules import TransformerSentenceEncoderLayer from packaging import version -from transformers import Data2VecConfig, Data2VecForMaskedLM, Data2VecForSequenceClassification +from transformers import Data2VecTextConfig, Data2VecTextForMaskedLM, Data2VecTextForSequenceClassification from transformers.models.bert.modeling_bert import ( BertIntermediate, BertLayer, @@ -61,7 +61,7 @@ def convert_data2vec_checkpoint_to_pytorch( data2vec.eval() # disable dropout data2vec_model = data2vec.models[0] data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder - config = Data2VecConfig( + config = Data2VecTextConfig( vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings, hidden_size=data2vec_model.args.encoder_embed_dim, num_hidden_layers=data2vec_model.args.encoder_layers, @@ -75,7 +75,7 @@ def convert_data2vec_checkpoint_to_pytorch( config.num_labels = data2vec.model.classification_heads["mnli"].out_proj.weight.shape[0] print("Our BERT config:", config) - model = Data2VecForSequenceClassification(config) if classification_head else Data2VecForMaskedLM(config) + model = Data2VecTextForSequenceClassification(config) if classification_head else Data2VecTextForMaskedLM(config) model.eval() # Now let's copy all the weights. 
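Both conversion scripts end with the same parity check: run the original fairseq model and the converted Hugging Face model on identical inputs and compare the outputs element-wise. A self-contained sketch of that check, using stand-in tensors so it runs without fairseq (the helper name `check_parity` is illustrative and not part of the patch):

```python
import torch


def check_parity(our_output: torch.Tensor, their_output: torch.Tensor, atol: float = 1e-3) -> None:
    # A shape mismatch usually means a missed transpose or head during weight copying.
    if our_output.shape != their_output.shape:
        raise ValueError(f"Shape mismatch: {our_output.shape} vs {their_output.shape}")
    max_absolute_diff = (our_output - their_output).abs().max().item()
    print(f"max_absolute_diff = {max_absolute_diff:.2e}")
    if not torch.allclose(our_output, their_output, atol=atol):
        raise AssertionError("Converted model does not reproduce the original outputs")


# Stand-in tensors; in the real scripts these are the two models' forward passes.
check_parity(torch.ones(1, 4, 8), torch.ones(1, 4, 8) + 1e-5)
```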
@@ -180,7 +180,7 @@ def convert_data2vec_checkpoint_to_pytorch( parser = argparse.ArgumentParser() # Required parameters parser.add_argument( - "--data2vec_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." + "--checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." @@ -190,5 +190,5 @@ def convert_data2vec_checkpoint_to_pytorch( ) args = parser.parse_args() convert_data2vec_checkpoint_to_pytorch( - args.data2vec_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head + args.checkpoint_path, args.pytorch_dump_folder_path, args.classification_head ) diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 933458d2b5d4d..5fd4cd29688c5 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -33,15 +33,10 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) -from ...modeling_outputs import ( - BaseModelOutput, - CausalLMOutput, - SequenceClassifierOutput, - TokenClassifierOutput, -) +from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput from ...modeling_utils import PreTrainedModel, torch_int_div from ...utils import logging -from .configuration_data2vec_audio import Data2VecAudioConfig +from .configuration_data2vec import Data2VecAudioConfig logger = logging.get_logger(__name__) @@ -76,7 +71,7 @@ _XVECTOR_EXPECTED_OUTPUT = 0.98 -DATA_2_VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = [ +DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = [ "facebook/data2vec-audio-base-960h", "facebook/data2vec-audio-large-960h", "facebook/data2vec-audio-large-960h-lv60", @@ -411,7 +406,9 @@ def forward(self, hidden_states): class Data2VecAudioPositionalConvEmbedding(nn.Module): def __init__(self, config): super().__init__() - self.layers = nn.ModuleList([Data2VecAudioPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]) + self.layers = nn.ModuleList( + [Data2VecAudioPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)] + ) def forward(self, hidden_states): hidden_states = hidden_states.transpose(1, 2) @@ -426,9 +423,9 @@ class Data2VecAudioFeatureEncoder(nn.Module): def __init__(self, config): super().__init__() - self.conv_layers = nn.ModuleList([ - Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) - ]) + self.conv_layers = nn.ModuleList( + [Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)] + ) self.gradient_checkpointing = False self._requires_grad = True @@ -930,7 +927,7 @@ def _set_gradient_checkpointing(self, module, value=False): module.gradient_checkpointing = value -DATA_2_VEC_AUDIO_START_DOCSTRING = r""" +DATA2VEC_AUDIO_START_DOCSTRING = r""" Data2VecAudio was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. 
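Once converted, the audio model is meant to be used like Wav2Vec2, reusing `Wav2Vec2Processor` as the conversion script above does. A rough usage sketch for CTC, assuming the `facebook/data2vec-audio-base-960h` checkpoint from the archive list is published under that name:

```python
import torch

from transformers import Data2VecAudioForCTC, Wav2Vec2Processor

# Assumes the checkpoint named in DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST is available on the Hub.
processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-base-960h")
model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h")
model.eval()

# One second of 16 kHz silence stands in for real speech to keep the sketch self-contained.
dummy_speech = [0.0] * 16_000
inputs = processor(dummy_speech, sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    # A single unpadded example needs no attention_mask; see the inputs docstring below for the batched case.
    logits = model(inputs.input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids))
```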
@@ -949,7 +946,7 @@ def _set_gradient_checkpointing(self, module, value=False): """ -DATA_2_VEC_AUDIO_INPUTS_DOCSTRING = r""" +DATA2VEC_AUDIO_INPUTS_DOCSTRING = r""" Args: input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file @@ -969,10 +966,10 @@ def _set_gradient_checkpointing(self, module, value=False): `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == True`. For all models whose processor has `config.return_attention_mask == False`, such as - [data2vec-audio-base](https://huggingface.co/facebook/data2vec-audio-base-960h), `attention_mask` should **not** be - passed to avoid degraded performance when doing batched inference. For such models `input_values` should - simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly - different results depending on whether `input_values` is padded or not. + [data2vec-audio-base](https://huggingface.co/facebook/data2vec-audio-base-960h), `attention_mask` should + **not** be passed to avoid degraded performance when doing batched inference. For such models + `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware that these + models also yield slightly different results depending on whether `input_values` is padded or not. @@ -989,7 +986,7 @@ def _set_gradient_checkpointing(self, module, value=False): @add_start_docstrings( "The bare Data2VecAudio Model transformer outputting raw hidden-states without any specific head on top.", - DATA_2_VEC_AUDIO_START_DOCSTRING, + DATA2VEC_AUDIO_START_DOCSTRING, ) class Data2VecAudioModel(Data2VecAudioPreTrainedModel): def __init__(self, config: Data2VecAudioConfig): @@ -1062,7 +1059,7 @@ def _mask_hidden_states( return hidden_states - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_PROCESSOR_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1124,7 +1121,7 @@ def forward( ) -@add_start_docstrings("""Data2VecAudio Model with a quantizer and `VQ` head on top.""", DATA_2_VEC_AUDIO_START_DOCSTRING) +@add_start_docstrings("""Data2VecAudio Model with a quantizer and `VQ` head on top.""", DATA2VEC_AUDIO_START_DOCSTRING) class Data2VecAudioForPreTraining(Data2VecAudioPreTrainedModel): def __init__(self, config: Data2VecAudioConfig): super().__init__(config) @@ -1172,7 +1169,7 @@ def compute_contrastive_logits( logits = logits / temperature return logits - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Data2VecAudioForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1199,7 +1196,7 @@ def forward( ```python >>> import torch >>> from transformers import Wav2Vec2FeatureExtractor, Data2VecAudioForPreTraining - >>> from transformers.models.data2vec-audio.modeling_data2vec-audio import _compute_mask_indices + >>> from transformers.models.data2vec.modeling_data2vec import _compute_mask_indices >>> from datasets import load_dataset >>> import soundfile as sf @@ -1333,7 +1330,7 @@ def forward( @add_start_docstrings( """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", - DATA_2_VEC_AUDIO_START_DOCSTRING, + 
DATA2VEC_AUDIO_START_DOCSTRING, ) class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel): def __init__(self, config): @@ -1361,7 +1358,7 @@ def freeze_feature_encoder(self): """ self.data2vec_audio.feature_extractor._freeze_parameters() - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_PROCESSOR_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1445,10 +1442,10 @@ def forward( @add_start_docstrings( """ - Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like - SUPERB Keyword Spotting. + Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks + like SUPERB Keyword Spotting. """, - DATA_2_VEC_AUDIO_START_DOCSTRING, + DATA2VEC_AUDIO_START_DOCSTRING, ) class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel): def __init__(self, config): @@ -1479,7 +1476,7 @@ def freeze_base_model(self): for param in self.data2vec_audio.parameters(): param.requires_grad = False - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_SEQ_CLASS_CHECKPOINT, @@ -1555,7 +1552,7 @@ def forward( """ Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization. """, - DATA_2_VEC_AUDIO_START_DOCSTRING, + DATA2VEC_AUDIO_START_DOCSTRING, ) class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel): def __init__(self, config): @@ -1584,7 +1581,7 @@ def freeze_base_model(self): for param in self.data2vec_audio.parameters(): param.requires_grad = False - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_FRAME_CLASS_CHECKPOINT, @@ -1694,7 +1691,7 @@ def forward(self, hidden_states): """ Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification. 
""", - DATA_2_VEC_AUDIO_START_DOCSTRING, + DATA2VEC_AUDIO_START_DOCSTRING, ) class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel): def __init__(self, config): @@ -1746,7 +1743,7 @@ def _conv_out_length(input_length, kernel_size, stride): return input_lengths - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_XVECTOR_CHECKPOINT, diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 75ba37c2a4152..6bc514bcc0dbe 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -586,7 +586,6 @@ def forward(self, hidden_states): return pooled_output -# Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->Data2VecText, roberta->data2vec-text class Data2VecTextPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -1259,8 +1258,8 @@ def forward( @add_start_docstrings( """ - Data2VecText Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. + Data2VecText Model with a multiple choice classification head on top (a linear layer on top of the pooled output + and a softmax) e.g. for RocStories/SWAG tasks. """, DATA2VECTEXT_START_DOCSTRING, ) @@ -1354,8 +1353,8 @@ def forward( @add_start_docstrings( """ - Data2VecText Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. + Data2VecText Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. """, DATA2VECTEXT_START_DOCSTRING, ) @@ -1462,8 +1461,8 @@ def forward(self, features, **kwargs): @add_start_docstrings( """ - Data2VecText Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + Data2VecText Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", DATA2VECTEXT_START_DOCSTRING, ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 9012f4b79c74d..ceeebdd50df44 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1142,63 +1142,105 @@ def __init__(self, *args, **kwargs): DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = None -class Data2VecForAudioModel(metaclass=DummyObject): +class Data2VecAudioForAudioFrameClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForCausalLM(metaclass=DummyObject): +class Data2VecAudioForCTC(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForMaskedLM(metaclass=DummyObject): +class Data2VecAudioForPreTraining(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForMultipleChoice(metaclass=DummyObject): +class Data2VecAudioForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForQuestionAnswering(metaclass=DummyObject): +class Data2VecAudioForXVector(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForSequenceClassification(metaclass=DummyObject): +class Data2VecAudioModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForTextModel(metaclass=DummyObject): +class Data2VecAudioPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForTokenClassification(metaclass=DummyObject): +class Data2VecTextForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecPreTrainedModel(metaclass=DummyObject): +class Data2VecTextForMaskedLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextForMultipleChoice(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 3426ed9b86507..03d97f222fd8d 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -12,522 +12,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - - -import unittest - -from transformers import Data2VecConfig, is_torch_available -from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device - -from .test_configuration_common import ConfigTester -from .test_generation_utils import GenerationTesterMixin -from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - - -if is_torch_available(): - import torch - - from transformers import ( - Data2VecForCausalLM, - Data2VecForMaskedLM, - Data2VecForMultipleChoice, - Data2VecForQuestionAnswering, - Data2VecForSequenceClassification, - Data2VecForTextModel, - Data2VecForTokenClassification, - ) - from transformers.models.data2vec.modeling_data2vec import ( - DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecForTextEmbeddings, - create_position_ids_from_input_ids, - ) - - -class Data2VecModelTester: - def __init__( - self, - parent, - ): - self.parent = parent - self.batch_size = 13 - self.seq_length = 7 - self.is_training = True - self.use_input_mask = True - self.use_token_type_ids = True - self.use_labels = True - self.vocab_size = 99 - self.hidden_size = 32 - self.num_hidden_layers = 5 - self.num_attention_heads = 4 - self.intermediate_size = 37 - self.hidden_act = "gelu" - self.hidden_dropout_prob = 0.1 - self.attention_probs_dropout_prob = 0.1 - self.max_position_embeddings = 512 - self.type_vocab_size = 16 - self.type_sequence_label_size = 2 - self.initializer_range = 0.02 - self.num_labels = 3 - self.num_choices = 4 - self.scope = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return Data2VecConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - 
encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecForTextModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Data2VecForTextModel(config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Data2VecForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Data2VecForCausalLM(config=config).to(torch_device).eval() - - # make sure that ids don't start with pad token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - 
output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = Data2VecForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = Data2VecForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - 
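The tester above builds a deliberately tiny model (hidden size 32, five layers, a 99-token vocabulary) so every head class can be exercised on CPU in seconds. The core shape check in isolation, as a rough sketch using the class names this patch series converges on:

```python
import torch

from transformers import Data2VecTextConfig, Data2VecTextModel

# Toy sizes mirroring the tester; the exact numbers are arbitrary.
config = Data2VecTextConfig(
    vocab_size=99,
    hidden_size=32,
    num_hidden_layers=5,
    num_attention_heads=4,
    intermediate_size=37,
    max_position_embeddings=512,
    type_vocab_size=16,
)
model = Data2VecTextModel(config)
model.eval()

batch_size, seq_length = 13, 7
input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_length))

with torch.no_grad():
    outputs = model(input_ids)

# The same assertions create_and_check_model makes.
assert outputs.last_hidden_state.shape == (batch_size, seq_length, config.hidden_size)
assert outputs.pooler_output.shape == (batch_size, config.hidden_size)
```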
- -@require_torch -class Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - - all_model_classes = ( - ( - Data2VecForCausalLM, - Data2VecForMaskedLM, - Data2VecForTextModel, - Data2VecForSequenceClassification, - Data2VecForTokenClassification, - Data2VecForMultipleChoice, - Data2VecForQuestionAnswering, - ) - if is_torch_available() - else () - ) - all_generative_model_classes = (Data2VecForCausalLM,) if is_torch_available() else () - - def setUp(self): - self.model_tester = Data2VecModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = Data2VecForTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_create_position_ids_respects_padding_index(self): - """Ensure that the default position ids only assign a sequential . 
This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = Data2VecForTextEmbeddings(config=config) - - input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = torch.as_tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = Data2VecForTextEmbeddings(config=config) - - inputs_embeds = torch.empty(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - -@require_torch -class Data2VecModelIntegrationTest(TestCasePlus): - @slow - def test_inference_masked_lm(self): - model = Data2VecForMaskedLM.from_pretrained("data2vec") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 11, 50265)) - self.assertEqual(output.shape, expected_shape) - # compare the actual values for a slice. - expected_slice = torch.tensor( - [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] - ) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_no_head(self): - model = Data2VecForTextModel.from_pretrained("data2vec") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - # compare the actual values for a slice. 
- expected_slice = torch.tensor( - [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] - ) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach() - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_classification_head(self): - model = Data2VecForSequenceClassification.from_pretrained("data2vec-large-mnli") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 3)) - self.assertEqual(output.shape, expected_shape) - expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli') - # data2vec.eval() - # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() - - self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) diff --git a/tests/test_modeling_data2vec_audio.py b/tests/test_modeling_data2vec_audio.py new file mode 100644 index 0000000000000..03d97f222fd8d --- /dev/null +++ b/tests/test_modeling_data2vec_audio.py @@ -0,0 +1,14 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/test_modeling_data2vec_text.py b/tests/test_modeling_data2vec_text.py new file mode 100644 index 0000000000000..ffdc88fbd8f74 --- /dev/null +++ b/tests/test_modeling_data2vec_text.py @@ -0,0 +1,534 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Data2VecText model. 
""" + + +import unittest + +from transformers import Data2VecTextConfig, is_torch_available +from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextModel, + ) + from transformers.models.data2vec.modeling_data2vec import ( + DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecTextForTextEmbeddings, + create_position_ids_from_input_ids, + ) + + +class Data2VecModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return Data2VecTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def 
create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = Data2VecTextModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = Data2VecTextForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = Data2VecTextForCausalLM(config=config).to(torch_device).eval() + + # make sure that ids don't start with pad token + mask = input_ids.ne(config.pad_token_id).long() + input_ids = input_ids * mask + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + + # make sure that ids don't start with pad token + mask = next_tokens.ne(config.pad_token_id).long() + next_tokens = next_tokens * mask + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + 
output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = Data2VecTextForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = Data2VecTextForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class 
Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextModel, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = Data2VecModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = Data2VecTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_create_position_ids_respects_padding_index(self): + """Ensure that the default position ids only assign a sequential . 
This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = Data2VecTextForTextEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) + + position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """Ensure that the default position ids only assign a sequential . This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = Data2VecTextForTextEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + +@require_torch +class Data2VecModelIntegrationTest(TestCasePlus): + @slow + def test_inference_masked_lm(self): + model = Data2VecTextForMaskedLM.from_pretrained("data2vec") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] + ) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') + # data2vec.eval() + # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_no_head(self): + model = Data2VecTextModel.from_pretrained("data2vec") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + # compare the actual values for a slice. 
+ expected_slice = torch.tensor( + [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] + ) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') + # data2vec.eval() + # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_classification_head(self): + model = Data2VecTextForSequenceClassification.from_pretrained("data2vec-large-mnli") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 3)) + self.assertEqual(output.shape, expected_shape) + expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli') + # data2vec.eval() + # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() + + self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) diff --git a/utils/check_repo.py b/utils/check_repo.py index d0148242faeed..4f532defe012e 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -85,7 +85,7 @@ "TFRobertaForMultipleChoice", # TODO: fix "TrOCRDecoderWrapper", # Building part of bigger (tested) model. "SeparableConv1D", # Building part of bigger (tested) model. - "Data2VecForAudioModel", # TODO: Add tests + "Data2VecAudioModel", # TODO: Add tests ] # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't @@ -103,6 +103,7 @@ "test_modeling_xlm_roberta.py", "test_modeling_vision_text_dual_encoder.py", "test_modeling_flax_vision_text_dual_encoder.py", + "test_modeling_data2vec_audio.py", # TODO: Add tests ] # Update this list for models that are not in any of the auto MODEL_XXX_MAPPING. Being in this list is an exception and @@ -165,7 +166,8 @@ "VisualBertForMultipleChoice", "TFWav2Vec2ForCTC", "TFHubertForCTC", - "Data2VecForAudioModel", # TODO: Fix + "Data2VecAudioModel", # TODO: Fix + "Data2VecAudioForPreTraining", # TODO: Fix ] # This is to make sure the transformers module imported is the one in the repo. diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index e842055d93bbd..dd36390c84442 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -345,6 +345,7 @@ def module_to_test_file(module_fname): "tests/sagemaker/test_single_node_gpu.py", # SageMaker test "tests/sagemaker/test_multi_node_model_parallel.py", # SageMaker test "tests/sagemaker/test_multi_node_data_parallel.py", # SageMaker test + "tests/test_modeling_data2vec.py", # TODO:Remove redundant file ] From 0d2cf132091d7906065214825f72d795efdb571a Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 22 Feb 2022 02:27:41 -0500 Subject: [PATCH 042/121] Remove Data2VecAudioForPretraining. Add tests for Data2VecAudio, mimicking the Wav2Vec2 test suite. Fix bias initilization in positional conv layers. Move back configurations for audio and text to separate files. 
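The "bias initialization" fix mentioned in the subject line concerns the positional convolution layers. The hunks below carry the actual change; the following is only a rough illustration of the usual recipe (normally distributed weights, zeroed bias) and is not copied from the patch:

```python
import math

import torch.nn as nn


def init_positional_conv(conv: nn.Conv1d) -> None:
    # Illustrative only: a fan-in scaled normal for the weight and a zero bias,
    # the common recipe for positional convolution embeddings.
    std = math.sqrt(4.0 / (conv.kernel_size[0] * conv.in_channels))
    nn.init.normal_(conv.weight, mean=0.0, std=std)
    if conv.bias is not None:
        nn.init.zeros_(conv.bias)


# A conv layer shaped like a typical positional embedding convolution.
conv = nn.Conv1d(in_channels=768, out_channels=768, kernel_size=19, padding=9, groups=16)
init_positional_conv(conv)
print(float(conv.bias.abs().sum()))  # 0.0 after initialization
```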
--- docs/source/model_doc/data2vec.mdx | 5 - src/transformers/__init__.py | 6 +- src/transformers/models/data2vec/__init__.py | 15 +- .../models/data2vec/configuration_data2vec.py | 432 ------ .../data2vec/configuration_data2vec_audio.py | 320 +++++ .../data2vec/configuration_data2vec_text.py | 147 ++ ..._original_pytorch_checkpoint_to_pytorch.py | 2 +- .../data2vec/modeling_data2vec_audio.py | 254 +--- .../models/data2vec/modeling_data2vec_text.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 7 - tests/test_modeling_data2vec_audio.py | 1273 +++++++++++++++++ utils/check_repo.py | 3 - 12 files changed, 1753 insertions(+), 713 deletions(-) create mode 100644 src/transformers/models/data2vec/configuration_data2vec_audio.py create mode 100644 src/transformers/models/data2vec/configuration_data2vec_text.py diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index 6474ceb4758ca..6c682e03858f7 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -64,11 +64,6 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma [[autodoc]] Data2VecAudioForCTC - forward -## Data2VecAudioForPreTraining - -[[autodoc]] Data2VecAudioForPreTraining - - forward - ## Data2VecAudioForSequenceClassification [[autodoc]] Data2VecAudioForSequenceClassification diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 41da5f5f663fe..bcbdcc8cd9c8e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -203,7 +203,7 @@ "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.cpm": ["CpmTokenizer"], "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"], - "models.data2vec": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecTextConfig"], + "models.data2vec": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig", "Data2VecTextConfig"], "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"], "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], @@ -852,7 +852,6 @@ "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", "Data2VecAudioForAudioFrameClassification", "Data2VecAudioForCTC", - "Data2VecAudioForPreTraining", "Data2VecAudioForSequenceClassification", "Data2VecAudioForXVector", "Data2VecAudioModel", @@ -2414,7 +2413,7 @@ from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.cpm import CpmTokenizer from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer - from .models.data2vec import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecTextConfig + from .models.data2vec import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig, Data2VecTextConfig from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig @@ -2954,7 +2953,6 @@ DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, Data2VecAudioForAudioFrameClassification, Data2VecAudioForCTC, - Data2VecAudioForPreTraining, Data2VecAudioForSequenceClassification, Data2VecAudioForXVector, Data2VecAudioModel, diff --git a/src/transformers/models/data2vec/__init__.py 
b/src/transformers/models/data2vec/__init__.py index 8abc7d3def191..d67c90a2d1314 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -22,11 +22,12 @@ _import_structure = { - "configuration_data2vec": [ + "configuration_data2vec_audio": [ "DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP", - "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig", - "Data2VecAudioOnnxConfig", + ], + "configuration_data2vec_text": [ + "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecTextConfig", "Data2VecTextOnnxConfig", ], @@ -37,7 +38,6 @@ "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST", "Data2VecAudioForAudioFrameClassification", "Data2VecAudioForCTC", - "Data2VecAudioForPreTraining", "Data2VecAudioForSequenceClassification", "Data2VecAudioForXVector", "Data2VecAudioModel", @@ -56,11 +56,9 @@ ] if TYPE_CHECKING: - from .configuration_data2vec import ( - DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, + from .configuration_data2vec_audio import DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig + from .configuration_data2vec_text import ( DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, - Data2VecAudioConfig, - Data2VecAudioOnnxConfig, Data2VecTextConfig, Data2VecTextOnnxConfig, ) @@ -70,7 +68,6 @@ DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST, Data2VecAudioForAudioFrameClassification, Data2VecAudioForCTC, - Data2VecAudioForPreTraining, Data2VecAudioForSequenceClassification, Data2VecAudioForXVector, Data2VecAudioModel, diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index 891a7c6293ac0..b0b4464ad1170 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -12,435 +12,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Data2VecText configuration""" -from collections import OrderedDict -from typing import Mapping - -from ...configuration_utils import PretrainedConfig -from ...onnx import OnnxConfig -from ...utils import logging - - -logger = logging.get_logger(__name__) - -DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json", -} - -DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/data2vec-base-960h": "https://huggingface.co/facebook/data2vec-audio-base-960h/resolve/main/config.json", - # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio -} - - -class Data2VecTextConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Data2VecTextModel`] and [`Data2VecTextModel`]. It - is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecText - [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 30522): - Vocabulary size of the BERT model. 
Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"silu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): - The dropout ratio for the attention probabilities. - max_position_embeddings (`int`, *optional*, defaults to 512): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. 
- - Examples: - - ```python - >>> from transformers import Data2VecTextModel, Data2VecTextConfig - - >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration - >>> configuration = Data2VecTextConfig() - - >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration - >>> model = Data2VecTextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "data2vec_text" - - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - position_embedding_type="absolute", - use_cache=True, - classifier_dropout=None, - **kwargs - ): - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout - - -class Data2VecTextOnnxConfig(OnnxConfig): - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("input_ids", {0: "batch", 1: "sequence"}), - ("attention_mask", {0: "batch", 1: "sequence"}), - ] - ) - - -class Data2VecAudioConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate - an Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the Data2VecAudio - [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32): - Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented - by the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`]. Vocabulary size - of the model. Defines the different tokens that can be represented by the *inputs_ids* passed to the - forward method of [`Data2VecAudioModel`]. - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. 
- intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (`float`, *optional*, defaults to 0.1): - The dropout ratio for the attention probabilities. - final_dropout (`float`, *optional*, defaults to 0.1): - The dropout probability for the final projection layer of [`Data2VecAudioForCTC`]. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group - normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D - convolutional layers. - feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature encoder. - feat_extract_activation (`str, `optional`, defaults to `"gelu"`): - The non-linear activation function (function or string) in the 1D convolutional layers of the feature - extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for quantized feature encoder states. - conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): - A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the - feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. - conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): - A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length - of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. - conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The - length of *conv_kernel* defines the number of convolutional layers and has to match the length of - *conv_dim*. - conv_bias (`bool`, *optional*, defaults to `False`): - Whether the 1D convolutional layers have a bias. - num_conv_pos_embeddings (`int`, *optional*, defaults to 128): - Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional - embeddings layer. - num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): - Number of groups of 1D convolutional positional embeddings layer. - mask_time_prob (`float`, *optional*, defaults to 0.05): - Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking - procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. 
If - reasoning from the propability of each feature vector to be chosen as the start of the vector span to be - masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the - mask_time_length (`int`, *optional*, defaults to 10): - Length of vector span along the time axis. - mask_time_min_masks (`int`, *optional*, defaults to 2),: - The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, - irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < - mask_time_min_masks'' - mask_feature_prob (`float`, *optional*, defaults to 0.0): - Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The - masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over - the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector - span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap - may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is - True`. - mask_feature_length (`int`, *optional*, defaults to 10): - Length of vector span along the feature axis. - mask_feature_min_masks (`int`, *optional*, defaults to 0),: - The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time - step, irrespectively of `mask_feature_prob`. Only relevant if - ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' - num_codevectors_per_group (`int`, *optional*, defaults to 320): - Number of entries in each quantization codebook (group). - num_codevector_groups (`int`, *optional*, defaults to 2): - Number of codevector groups for product codevector quantization. - contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): - The temperature *kappa* in the contrastive loss. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. - num_negatives (`int`, *optional*, defaults to 100): - Number of negative samples for the contrastive loss. - codevector_dim (`int`, *optional*, defaults to 256): - Dimensionality of the quantized feature vectors. - proj_codevector_dim (`int`, *optional*, defaults to 256): - Dimensionality of the final projection of both the quantized and the transformer features. - diversity_loss_weight (`int`, *optional*, defaults to 0.1): - The weight of the codebook diversity loss component. - ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): - Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an - instance of [`Data2VecAudioForCTC`]. - ctc_zero_infinity (`bool`, *optional*, defaults to `False`): - Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly - occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance - of [`Data2VecAudioForCTC`]. - use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): - Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an - instance of [`Data2VecAudioForSequenceClassification`]. 
- classifier_proj_size (`int`, *optional*, defaults to 256): - Dimensionality of the projection before token mean-pooling for classification. - tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): - A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* - module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. - tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the - *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. - tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): - A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the - *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. - xvector_output_dim (`int`, *optional*, defaults to 512): - Dimensionality of the *XVector* embedding vectors. - add_adapter (`bool`, *optional*, defaults to `False`): - Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful - for warm-starting Data2VecAudio for SpeechEncoderDecoder models. - adapter_kernel_size (`int`, *optional*, defaults to 3): - Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. - adapter_stride (`int`, *optional*, defaults to 2): - Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. - num_adapter_layers (`int`, *optional*, defaults to 3): - Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is - True`. - output_hidden_size (`int`, *optional*): - Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant - if `add_adapter is True`. 
- - Example: - - ```python - >>> from transformers import Data2VecAudioModel, Data2VecAudioConfig - - >>> # Initializing a Data2VecAudio facebook/wav2vec2-base-960h style configuration - >>> configuration = Data2VecAudioConfig() - - >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration - >>> model = Data2VecAudioModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "data2vec-audio" - - def __init__( - self, - vocab_size=32, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_dropout=0.0, - feat_quantizer_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - feat_extract_norm="group", - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - num_conv_pos_embedding_groups=16, - conv_pos_kernel_size=19, - num_conv_pos_embeddings=5, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - num_codevectors_per_group=320, - num_codevector_groups=2, - contrastive_logits_temperature=0.1, - num_negatives=100, - codevector_dim=256, - proj_codevector_dim=256, - diversity_loss_weight=0.1, - ctc_loss_reduction="sum", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=256, - tdnn_dim=(512, 512, 512, 512, 1500), - tdnn_kernel=(5, 3, 3, 1, 1), - tdnn_dilation=(1, 2, 3, 1, 1), - xvector_output_dim=512, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - add_adapter=False, - adapter_kernel_size=3, - adapter_stride=2, - num_adapter_layers=3, - output_hidden_size=None, - **kwargs - ): - super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.conv_pos_kernel_size = conv_pos_kernel_size - self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.use_weighted_layer_sum = use_weighted_layer_sum - - if ( - (len(self.conv_stride) != self.num_feat_extract_layers) - or (len(self.conv_kernel) != self.num_feat_extract_layers) - or (len(self.conv_dim) != self.num_feat_extract_layers) - ): - raise ValueError( - "Configuration for convolutional layers is incorrect. 
" - "It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, " - f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) " - f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`." - ) - - # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # parameters for pretraining with codevector quantized representations - self.num_codevectors_per_group = num_codevectors_per_group - self.num_codevector_groups = num_codevector_groups - self.contrastive_logits_temperature = contrastive_logits_temperature - self.feat_quantizer_dropout = feat_quantizer_dropout - self.num_negatives = num_negatives - self.codevector_dim = codevector_dim - self.proj_codevector_dim = proj_codevector_dim - self.diversity_loss_weight = diversity_loss_weight - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # adapter - self.add_adapter = add_adapter - self.adapter_kernel_size = adapter_kernel_size - self.adapter_stride = adapter_stride - self.num_adapter_layers = num_adapter_layers - self.output_hidden_size = output_hidden_size or hidden_size - - # SequenceClassification-specific parameter. Feel free to ignore for other classes. - self.classifier_proj_size = classifier_proj_size - - # XVector-specific parameters. Feel free to ignore for other classes. - self.tdnn_dim = list(tdnn_dim) - self.tdnn_kernel = list(tdnn_kernel) - self.tdnn_dilation = list(tdnn_dilation) - self.xvector_output_dim = xvector_output_dim diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py new file mode 100644 index 0000000000000..c6e5780e86294 --- /dev/null +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -0,0 +1,320 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Data2VecText configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/data2vec-base-960h": "https://huggingface.co/facebook/data2vec-audio-base-960h/resolve/main/config.json", + # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio +} + + +class Data2VecAudioConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate + an Data2VecAudio model according to the specified arguments, defining the model architecture. 
Instantiating a + configuration with the defaults will yield a similar configuration to that of the Data2VecAudio + [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32): + Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`]. Vocabulary size + of the model. Defines the different tokens that can be represented by the *inputs_ids* passed to the + forward method of [`Data2VecAudioModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + final_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the final projection layer of [`Data2VecAudioForCTC`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + feat_extract_norm (`str`, *optional*, defaults to `"group"`): + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group + normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D + convolutional layers. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for output of the feature encoder. + feat_extract_activation (`str, `optional`, defaults to `"gelu"`): + The non-linear activation function (function or string) in the 1D convolutional layers of the feature + extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for quantized feature encoder states. + conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): + A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. + conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. 
The length + of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. + conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The + length of *conv_kernel* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_bias (`bool`, *optional*, defaults to `False`): + Whether the 1D convolutional layers have a bias. + num_conv_pos_embeddings (`int`, *optional*, defaults to 128): + Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional + embeddings layer. + num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): + Number of groups of 1D convolutional positional embeddings layer. + mask_time_prob (`float`, *optional*, defaults to 0.05): + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If + reasoning from the propability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + mask_time_length (`int`, *optional*, defaults to 10): + Length of vector span along the time axis. + mask_time_min_masks (`int`, *optional*, defaults to 2),: + The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks'' + mask_feature_prob (`float`, *optional*, defaults to 0.0): + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over + the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, *optional*, defaults to 10): + Length of vector span along the feature axis. + mask_feature_min_masks (`int`, *optional*, defaults to 0),: + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' + num_codevectors_per_group (`int`, *optional*, defaults to 320): + Number of entries in each quantization codebook (group). + num_codevector_groups (`int`, *optional*, defaults to 2): + Number of codevector groups for product codevector quantization. + contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): + The temperature *kappa* in the contrastive loss. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. + num_negatives (`int`, *optional*, defaults to 100): + Number of negative samples for the contrastive loss. + codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the quantized feature vectors. 
+ proj_codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the final projection of both the quantized and the transformer features. + diversity_loss_weight (`int`, *optional*, defaults to 0.1): + The weight of the codebook diversity loss component. + ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`Data2VecAudioForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`Data2VecAudioForCTC`]. + use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): + Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an + instance of [`Data2VecAudioForSequenceClassification`]. + classifier_proj_size (`int`, *optional*, defaults to 256): + Dimensionality of the projection before token mean-pooling for classification. + tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): + A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* + module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. + tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the + *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. + tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): + A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the + *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. + xvector_output_dim (`int`, *optional*, defaults to 512): + Dimensionality of the *XVector* embedding vectors. + add_adapter (`bool`, *optional*, defaults to `False`): + Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful + for warm-starting Data2VecAudio for SpeechEncoderDecoder models. + adapter_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adapter_stride (`int`, *optional*, defaults to 2): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + num_adapter_layers (`int`, *optional*, defaults to 3): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + output_hidden_size (`int`, *optional*): + Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant + if `add_adapter is True`. 
+ + Example: + + ```python + >>> from transformers import Data2VecAudioModel, Data2VecAudioConfig + + >>> # Initializing a Data2VecAudio facebook/wav2vec2-base-960h style configuration + >>> configuration = Data2VecAudioConfig() + + >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration + >>> model = Data2VecAudioModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "data2vec-audio" + + def __init__( + self, + vocab_size=32, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout=0.1, + activation_dropout=0.1, + attention_dropout=0.1, + feat_proj_dropout=0.0, + feat_quantizer_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + feat_extract_norm="group", + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + num_conv_pos_embedding_groups=16, + conv_pos_kernel_size=19, + num_conv_pos_embeddings=5, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + num_codevectors_per_group=320, + num_codevector_groups=2, + contrastive_logits_temperature=0.1, + num_negatives=100, + codevector_dim=256, + proj_codevector_dim=256, + diversity_loss_weight=0.1, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + classifier_proj_size=256, + tdnn_dim=(512, 512, 512, 512, 1500), + tdnn_kernel=(5, 3, 3, 1, 1), + tdnn_dilation=(1, 2, 3, 1, 1), + xvector_output_dim=512, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=3, + output_hidden_size=None, + **kwargs + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.conv_pos_kernel_size = conv_pos_kernel_size + self.num_feat_extract_layers = len(self.conv_dim) + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.use_weighted_layer_sum = use_weighted_layer_sum + + if ( + (len(self.conv_stride) != self.num_feat_extract_layers) + or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. 
" + "It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, " + f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) " + f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # parameters for pretraining with codevector quantized representations + self.num_codevectors_per_group = num_codevectors_per_group + self.num_codevector_groups = num_codevector_groups + self.contrastive_logits_temperature = contrastive_logits_temperature + self.feat_quantizer_dropout = feat_quantizer_dropout + self.num_negatives = num_negatives + self.codevector_dim = codevector_dim + self.proj_codevector_dim = proj_codevector_dim + self.diversity_loss_weight = diversity_loss_weight + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.output_hidden_size = output_hidden_size or hidden_size + + # SequenceClassification-specific parameter. Feel free to ignore for other classes. + self.classifier_proj_size = classifier_proj_size + + # XVector-specific parameters. Feel free to ignore for other classes. + self.tdnn_dim = list(tdnn_dim) + self.tdnn_kernel = list(tdnn_kernel) + self.tdnn_dilation = list(tdnn_dilation) + self.xvector_output_dim = xvector_output_dim diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py new file mode 100644 index 0000000000000..2389afd215840 --- /dev/null +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Data2VecText configuration""" +from collections import OrderedDict +from typing import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json", +} + + +class Data2VecTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Data2VecTextModel`] and [`Data2VecTextModel`]. It + is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture. 
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecText + [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + + ```python + >>> from transformers import Data2VecTextModel, Data2VecTextConfig + + >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration + >>> configuration = Data2VecTextConfig() + + >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration + >>> model = Data2VecTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "data2vec_text" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class Data2VecTextOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py index 4ffdb27c082d8..e8a703de91f36 100644 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -24,7 +24,7 @@ from datasets import load_dataset from transformers import Wav2Vec2Processor, logging -from transformers.models.data2vec.configuration_data2vec import Data2VecAudioConfig +from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy # noqa: F401 diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 5fd4cd29688c5..aad099fa7dce9 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -31,12 +31,11 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, - replace_return_docstrings, ) from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput from ...modeling_utils import PreTrainedModel, torch_int_div from ...utils 
import logging -from .configuration_data2vec import Data2VecAudioConfig +from .configuration_data2vec_audio import Data2VecAudioConfig logger = logging.get_logger(__name__) @@ -109,48 +108,6 @@ class Data2VecAudioBaseModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -@dataclass -class Data2VecAudioForPreTrainingOutput(ModelOutput): - """ - Output type of [`Data2VecAudioForPreTraining`], with potential hidden states and attentions. - - Args: - loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official - paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. - projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): - Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked - projected quantized states. - projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): - Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive - target vectors for contrastive loss. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): - The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . - diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): - The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . 
- """ - - loss: Optional[torch.FloatTensor] = None - projected_states: torch.FloatTensor = None - projected_quantized_states: torch.FloatTensor = None - codevector_perplexity: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - contrastive_loss: Optional[torch.FloatTensor] = None - diversity_loss: Optional[torch.FloatTensor] = None - - @dataclass class XVectorOutput(ModelOutput): """ @@ -862,6 +819,8 @@ def _init_weights(self, module): k = math.sqrt(1 / module.projection.in_features) nn.init.uniform_(module.projection.weight, a=-k, b=k) nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, Data2VecAudioPositionalConvLayer): + nn.init.constant_(module.conv.bias, 0) elif isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) @@ -1121,213 +1080,6 @@ def forward( ) -@add_start_docstrings("""Data2VecAudio Model with a quantizer and `VQ` head on top.""", DATA2VEC_AUDIO_START_DOCSTRING) -class Data2VecAudioForPreTraining(Data2VecAudioPreTrainedModel): - def __init__(self, config: Data2VecAudioConfig): - super().__init__(config) - self.data2vec_audio = Data2VecAudioModel(config) - self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) - - # Initialize weights and apply final processing - self.post_init() - - # make sure that project_hid & project_q are initialized like normal linear layers - self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim) - self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim) - - def set_gumbel_temperature(self, temperature: int): - """ - Set the Gumbel softmax temperature to a given value. Only necessary for training - """ - self.quantizer.temperature = temperature - - def freeze_feature_encoder(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - self.data2vec_audio.feature_extractor._freeze_parameters() - - @staticmethod - def compute_contrastive_logits( - target_features: torch.FloatTensor, - negative_features: torch.FloatTensor, - predicted_features: torch.FloatTensor, - temperature: int = 0.1, - ): - """ - Compute logits for contrastive loss based using cosine similarity as the distance measure between - `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied. - """ - target_features = torch.cat([target_features, negative_features], dim=0) - - logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as( - target_features - ) - - # apply temperature - logits = logits / temperature - return logits - - @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Data2VecAudioForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_values, - attention_mask=None, - mask_time_indices=None, - sampled_negative_indices=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict - masked extracted features in *config.proj_codevector_dim* space. 
- sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): - Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. - Required input for pre-training. - - Returns: - - Example: - - ```python - >>> import torch - >>> from transformers import Wav2Vec2FeatureExtractor, Data2VecAudioForPreTraining - >>> from transformers.models.data2vec.modeling_data2vec import _compute_mask_indices - >>> from datasets import load_dataset - >>> import soundfile as sf - - >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("patrickvonplaten/data2vec-audio-base") - >>> model = Data2VecAudioForPreTraining.from_pretrained("patrickvonplaten/data2vec-audio-base") - - - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch - - - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> ds = ds.map(map_to_array) - - >>> input_values = feature_extractor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 - - >>> # compute masked indices - >>> batch_size, raw_sequence_length = input_values.shape - >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length) - >>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2) - >>> mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long) - - >>> with torch.no_grad(): - ... outputs = model(input_values, mask_time_indices=mask_time_indices) - - >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) - >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) - - >>> # show that cosine similarity is much higher than random - >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5 - tensor(True) - - >>> # for contrastive loss training model should be put into train mode - >>> model = model.train() - >>> loss = model(input_values, mask_time_indices=mask_time_indices).loss - ```""" - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if mask_time_indices is not None: - mask_time_indices = mask_time_indices.to(torch.bool) - - outputs = self.data2vec_audio( - input_values, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - mask_time_indices=mask_time_indices, - return_dict=return_dict, - ) - - # 1. project all transformed features (including masked) to final vq dim - transformer_features = self.project_hid(outputs[0]) - - # 2. quantize all (unmasked) extracted features and project to final vq dim - extract_features = self.dropout_features(outputs[1]) - - if attention_mask is not None: - # compute reduced attention_mask correponding to feature vectors - attention_mask = self._get_feature_vector_attention_mask( - extract_features.shape[1], attention_mask, add_adapter=False - ) - - quantized_features, codevector_perplexity = self.quantizer( - extract_features, mask_time_indices=mask_time_indices - ) - quantized_features = self.project_q(quantized_features) - - loss = contrastive_loss = diversity_loss = None - if sampled_negative_indices is not None: - batch_size, sequence_length, hidden_size = quantized_features.shape - - # for training, we sample negatives - # 3. 
sample K negatives (distractors) quantized states for contrastive loss - # if attention_mask is passed, make sure that padded feature vectors cannot be sampled - # sample negative quantized vectors BTC => (BxT)C - negative_quantized_features = quantized_features.view(-1, hidden_size)[ - sampled_negative_indices.long().view(-1) - ] - negative_quantized_features = negative_quantized_features.view( - batch_size, sequence_length, -1, hidden_size - ).permute(2, 0, 1, 3) - - # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa` - # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf - logits = self.compute_contrastive_logits( - quantized_features[None, :], - negative_quantized_features, - transformer_features, - self.config.contrastive_logits_temperature, - ) - - # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low), - # its cosine similarity will be masked - neg_is_pos = (quantized_features == negative_quantized_features).all(-1) - - if neg_is_pos.any(): - logits[1:][neg_is_pos] = float("-inf") - - # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = - # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) - logits = logits.transpose(0, 2).reshape(-1, logits.size(0)) - target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten() - - contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") - # 7. compute diversity loss: \mathbf{L}_d - num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups - diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() - - # 8. \mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d - loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss - - if not return_dict: - if loss is not None: - return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:] - return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] - - return Data2VecAudioForPreTrainingOutput( - loss=loss, - projected_states=transformer_features, - projected_quantized_states=quantized_features, - codevector_perplexity=codevector_perplexity, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - contrastive_loss=contrastive_loss, - diversity_loss=diversity_loss, - ) - - @add_start_docstrings( """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", DATA2VEC_AUDIO_START_DOCSTRING, diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 6bc514bcc0dbe..4db194150bdb6 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -46,7 +46,7 @@ prune_linear_layer, ) from ...utils import logging -from .configuration_data2vec import Data2VecTextConfig +from .configuration_data2vec_text import Data2VecTextConfig logger = logging.get_logger(__name__) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index ceeebdd50df44..9b78968d9e179 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1156,13 +1156,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecAudioForPreTraining(metaclass=DummyObject): - _backends = ["torch"] - - def 
__init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class Data2VecAudioForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/test_modeling_data2vec_audio.py b/tests/test_modeling_data2vec_audio.py index 03d97f222fd8d..8a39f3d11e789 100644 --- a/tests/test_modeling_data2vec_audio.py +++ b/tests/test_modeling_data2vec_audio.py @@ -12,3 +12,1276 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" Testing suite for the PyTorch Data2VecAudio model. """ + +import math +import unittest + +import numpy as np +from datasets import load_dataset + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import Data2VecAudioConfig, is_torch_available +from transformers.testing_utils import is_pt_flax_cross_test, require_soundfile, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, _config_zero_init + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Wav2Vec2FeatureExtractor, + Wav2Vec2Processor, + ) + from transformers.models.wav2vec2.modeling_wav2vec2 import ( + Wav2Vec2GumbelVectorQuantizer, + _compute_mask_indices, + _sample_negative_indices, + ) + + +class Data2VecAudioModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, # speech is longer + is_training=False, + hidden_size=16, + feat_extract_norm="group", + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=4, + num_attention_heads=2, + hidden_dropout_prob=0.1, # this is most likely not correctly set yet + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + mask_time_prob=0.5, + mask_time_length=2, + vocab_size=32, + do_stable_layer_norm=False, + num_adapter_layers=1, + adapter_stride=2, + tdnn_dim=(32, 32), + tdnn_kernel=(5, 3), + tdnn_dilation=(1, 2), + xvector_output_dim=32, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.num_adapter_layers = num_adapter_layers + self.adapter_stride = adapter_stride + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + 
self.scope = scope + self.tdnn_dim = tdnn_dim + self.tdnn_kernel = tdnn_kernel + self.tdnn_dilation = tdnn_dilation + self.xvector_output_dim = xvector_output_dim + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = self.get_config() + + return config, input_values, attention_mask + + def get_config(self): + return Data2VecAudioConfig( + hidden_size=self.hidden_size, + feat_extract_norm=self.feat_extract_norm, + feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + mask_time_prob=self.mask_time_prob, + mask_time_length=self.mask_time_length, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + layer_norm_eps=self.layer_norm_eps, + do_stable_layer_norm=self.do_stable_layer_norm, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + vocab_size=self.vocab_size, + num_adapter_layers=self.num_adapter_layers, + adapter_stride=self.adapter_stride, + tdnn_dim=self.tdnn_dim, + tdnn_kernel=self.tdnn_kernel, + tdnn_dilation=self.tdnn_dilation, + xvector_output_dim=self.xvector_output_dim, + ) + + def create_and_check_model(self, config, input_values, attention_mask): + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter(self, config, input_values, attention_mask): + config.add_adapter = True + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): + config.add_adapter = True + config.output_hidden_size = 8 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), + ) + + def create_and_check_batch_inference(self, config, input_values, *args): + # test does not pass for models making use of `group_norm` + # check: https://github.com/pytorch/fairseq/issues/3227 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, 
device=torch_device, dtype=torch.bool) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0.0 + + batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state + + for i in range(input_values.shape[0]): + input_slice = input_values[i : i + 1, : input_lengths[i]] + output = model(input_slice).last_hidden_state + + batch_output = batch_outputs[i : i + 1, : output.shape[1]] + self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) + + def check_ctc_loss(self, config, input_values, *args): + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + model.config.ctc_loss_reduction = "sum" + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + model.config.ctc_loss_reduction = "mean" + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(sum_loss, float)) + self.parent.assertTrue(isinstance(mean_loss, float)) + + def check_seq_classifier_loss(self, config, input_values, *args): + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + unmasked_loss = model(input_values, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(masked_loss, float)) + self.parent.assertTrue(isinstance(unmasked_loss, float)) + self.parent.assertTrue(masked_loss != unmasked_loss) + + def check_ctc_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + model.train() + + # freeze feature encoder + model.freeze_feature_encoder() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + if max_length_labels[i] < labels.shape[-1]: + # it's important that we make sure that target lenghts are at least + # one shorter than logit lenghts to prevent -inf + labels[i, max_length_labels[i] - 1 :] = -100 + + loss = 
model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_seq_classifier_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_xvector_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForXVector(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_labels_out_of_vocab(self, config, input_values, *args): + model = Data2VecAudioForCTC(config) + model.to(torch_device) + model.train() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) + + with self.parent.assertRaises(ValueError): + model(input_values, labels=labels) + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (Data2VecAudioForCTC, Data2VecAudioModel, Data2VecAudioForSequenceClassification) + if is_torch_available() + else () + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Data2VecAudioModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_adapter(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) + + def test_model_with_adapter_proj_dim(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + 
config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_xvector_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_xvector_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + # Data2VecAudio has no inputs_embeds + def test_inputs_embeds(self): + pass + + # `input_ids` is renamed to `input_values` + def test_forward_signature(self): + pass + + # Data2VecAudio cannot resize token embeddings + # since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # Data2VecAudio has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_flax_to_pt(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_pt_to_flax(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + "project_hid.weight", + "project_hid.bias", + "project_q.weight", + "project_q.bias", + "feature_projection.projection.weight", + "feature_projection.projection.bias", + "objective.weight", + ] + if param.requires_grad: + if any([x in name for x in uniform_init_parms]): + self.assertTrue( + -1.0 <= ((param.data.mean() 
* 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + def test_mask_feature_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + def test_mask_time_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") + self.assertIsNotNone(model) + + +@require_torch +class Data2VecAudioRobustModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + Data2VecAudioForCTC, + Data2VecAudioModel, + Data2VecAudioForSequenceClassification, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForXVector, + ) + if is_torch_available() + else () + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Data2VecAudioModelTester( + self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True + ) + self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) + + def test_config(self): + 
self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_adapter(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) + + def test_model_with_adapter_proj_dim(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) + + def test_batched_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_batch_inference(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_xvector_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_xvector_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + # Data2VecAudio has no inputs_embeds + def test_inputs_embeds(self): + pass + + # `input_ids` is renamed to `input_values` + def test_forward_signature(self): + pass + + # Data2VecAudio cannot resize token embeddings + # since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # Data2VecAudio has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def 
test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + "project_hid.weight", + "project_hid.bias", + "project_q.weight", + "project_q.bias", + "feature_projection.projection.weight", + "feature_projection.projection.bias", + "objective.weight", + ] + if param.requires_grad: + if any([x in name for x in uniform_init_parms]): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + def test_mask_feature_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + def test_mask_time_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + def test_mask_time_feature_prob_ctc_single_batch(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", + mask_time_prob=0.2, + mask_feature_prob=0.2, + mask_time_length=2, + mask_feature_length=2, + ) + 
model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (1, 1498, 32)) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") + self.assertIsNotNone(model) + + +@require_torch +class Data2VecAudioUtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) + + def test_compute_mask_indices_low_prob(self): + # with these settings num_masked_spans=0.5, which means probabilistic rounding + # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in + # the other 5 out of 10, cases num_masked_spans=1 + n_trials = 100 + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + count_dimensions_masked = 0 + count_dimensions_not_masked = 0 + + for _ in range(n_trials): + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + num_masks = torch.sum(mask).item() + + if num_masks > 0: + count_dimensions_masked += 1 + else: + count_dimensions_not_masked += 1 + + # as we test for at least 10 masked dimension and at least + # 10 non-masked dimension, this test could fail with probability: + # P(100 coin flips, at most 9 heads) = 1.66e-18 + self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) + self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + def test_compute_mask_indices_attn_mask_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + attention_mask[:2, sequence_length // 2 :] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask + ) + mask = torch.from_numpy(mask).to(torch_device) + + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) + + def test_compute_mask_indices_short_audio(self): + batch_size = 4 + 
sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + # force one example to be heavily padded + attention_mask[0, 5:] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 + ) + + # make sure that non-padded examples cannot be padded + self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) + + def test_compute_perplexity(self): + probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100 + + ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) + self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) + + # mask half of the input + mask = torch.ones((2,), device=torch_device, dtype=torch.bool) + mask[0] = 0 + + ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) + self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3) + + def test_sample_negatives(self): + batch_size = 2 + sequence_length = 10 + hidden_size = 4 + num_negatives = 3 + + features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( + sequence_length, hidden_size + ) # each value in vector consits of same value + features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() + + # sample negative indices + sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) + sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) + negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] + negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) + self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) + + # make sure no negatively sampled vector is actually a positive one + for negative in negatives: + self.assertTrue(((negative - features) == 0).sum() == 0.0) + + # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim + self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) + + def test_sample_negatives_with_mask(self): + batch_size = 2 + sequence_length = 10 + hidden_size = 4 + num_negatives = 3 + + # second half of last input tensor is padded + mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + mask[-1, sequence_length // 2 :] = 0 + + features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( + sequence_length, hidden_size + ) # each value in vector consits of same value + features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() + + # replace masked feature vectors with -100 to test that those are not sampled + features = torch.where(mask[:, :, None].expand(features.shape).bool(), features, -100) + + # sample negative indices + sampled_negative_indices = _sample_negative_indices( + (batch_size, sequence_length), num_negatives, mask.cpu().numpy() + ) + sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) + negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] + negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) + + self.assertTrue((negatives >= 0).all().item()) + + self.assertTrue(negatives.shape == (num_negatives, 
batch_size, sequence_length, hidden_size)) + + # make sure no negatively sampled vector is actually a positive one + for negative in negatives: + self.assertTrue(((negative - features) == 0).sum() == 0.0) + + # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim + self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) + + +@require_torch +@require_soundfile +@slow +class Data2VecAudioModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def _load_superb(self, task, num_samples): + ds = load_dataset("anton-l/superb_dummy", task, split="test") + + return ds[:num_samples] + + def test_inference_ctc_normal(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + input_speech = self._load_datasamples(1) + + input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_normal_batched(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(2) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_robust_batched(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping 
blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + "his instant panic was followed by a small sharp blow high on his chest", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_keyword_spotting(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") + input_data = self._load_superb("ks", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) + + expected_labels = [7, 6, 10, 9] + # s3prl logits for the same batch + expected_logits = torch.tensor([6.1186, 11.8961, 10.2931, 6.0898], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_inference_intent_classification(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("ic", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + + predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1) + predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1) + predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1) + + expected_labels_action = [0, 0, 2, 3] + expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device) + expected_labels_object = [3, 10, 3, 4] + expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device) + expected_labels_location = [0, 0, 0, 1] + expected_logits_location = torch.tensor([1.5335, 6.5096, 10.5704, 11.0569], device=torch_device) + + self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) + self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) + self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) + + self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) + self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) + self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) + + def test_inference_speaker_identification(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("si", 4) + + output_logits = [] + with torch.no_grad(): + for example in input_data["speech"]: + input = processor(example, return_tensors="pt", padding=True) + output = 
model(input.input_values.to(torch_device), attention_mask=None) + output_logits.append(output.logits[0]) + output_logits = torch.stack(output_logits) + predicted_logits, predicted_ids = torch.max(output_logits, dim=-1) + + expected_labels = [251, 1, 1, 3] + # s3prl logits for the same batch + expected_logits = torch.tensor([37.5627, 71.6362, 64.2419, 31.7778], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_inference_emotion_recognition(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("er", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) + + expected_labels = [1, 1, 2, 2] + # s3prl logits for the same batch + expected_logits = torch.tensor([2.1722, 3.0779, 8.0287, 6.6797], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_phoneme_recognition(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-ic") + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t", + "s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ oʊ n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ", + "ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː v ə n ð ə s ɔːɹ ɹ ɪ ŋ ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t", + "h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t", + ] + # should correspond to =>: + # [ + # "a man said to the universe sir i exist", + # "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + # "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + # "his instant panic was followed by a small sharp blow high on his chest", + # ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_diarization(self): + model = Data2VecAudioForAudioFrameClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + 
input_data = self._load_superb("sd", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + # labels is a one-hot array of shape (num_frames, num_speakers) + labels = (outputs.logits > 0).long() + + # s3prl logits for the same batch + expected_logits = torch.tensor( + [ + [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], + [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], + [[-0.8656, -0.4783], [-0.8899, -0.3289], [-0.9267, -0.5781], [-0.7817, -0.4619]], + [[-4.8625, -2.5316], [-5.2339, -2.2155], [-4.9835, -2.0344], [-4.4727, -1.8421]], + ], + device=torch_device, + ) + self.assertEqual(labels[0, :, 0].sum(), 555) + self.assertEqual(labels[0, :, 1].sum(), 299) + # TODO: update the tolerance after the CI moves to torch 1.10 + self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) + + def test_inference_speaker_verification(self): + model = Data2VecAudioForXVector.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("si", 4) + + inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) + labels = torch.tensor([5, 1, 1, 3], device=torch_device).T + + with torch.no_grad(): + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + outputs = model(input_values, attention_mask=attention_mask, labels=labels) + embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1).cpu() + + cosine_sim = torch.nn.CosineSimilarity(dim=-1) + # id10002 vs id10002 + self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) + # id10006 vs id10002 + self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).numpy(), 0.7579, 3) + # id10002 vs id10004 + self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).numpy(), 0.7594, 3) + + # TODO: update the tolerance after the CI moves to torch 1.10 + self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) diff --git a/utils/check_repo.py b/utils/check_repo.py index 4f532defe012e..4c9d870cf4197 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -85,7 +85,6 @@ "TFRobertaForMultipleChoice", # TODO: fix "TrOCRDecoderWrapper", # Building part of bigger (tested) model. "SeparableConv1D", # Building part of bigger (tested) model. - "Data2VecAudioModel", # TODO: Add tests ] # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't @@ -103,7 +102,6 @@ "test_modeling_xlm_roberta.py", "test_modeling_vision_text_dual_encoder.py", "test_modeling_flax_vision_text_dual_encoder.py", - "test_modeling_data2vec_audio.py", # TODO: Add tests ] # Update this list for models that are not in any of the auto MODEL_XXX_MAPPING. Being in this list is an exception and @@ -167,7 +165,6 @@ "TFWav2Vec2ForCTC", "TFHubertForCTC", "Data2VecAudioModel", # TODO: Fix - "Data2VecAudioForPreTraining", # TODO: Fix ] # This is to make sure the transformers module imported is the one in the repo. 
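The testers and integration tests above all size their expected outputs from the convolutional feature extractor: `_get_feat_extract_output_lengths` folds the standard 1D-convolution length formula over the conv layers, and with base-model strides a 6-second 16 kHz clip collapses to 299 frames, the sequence length asserted in `test_mask_feature_prob_ctc` and `test_mask_time_prob_ctc`. A minimal sketch of that arithmetic, assuming wav2vec 2.0-style kernel/stride defaults (the tuples below are illustrative assumptions, not values taken from this patch):

```python
# Minimal sketch (not part of this patch): the per-layer conv-length arithmetic
# behind `_get_feat_extract_output_lengths`, which the testers above rely on when
# sizing expected hidden states and CTC labels. The kernel/stride tuples are
# assumed wav2vec 2.0-style base defaults, not values from this diff.

CONV_KERNEL = (10, 3, 3, 3, 3, 2, 2)  # assumed defaults
CONV_STRIDE = (5, 2, 2, 2, 2, 2, 2)   # assumed defaults


def feat_extract_output_length(input_length: int) -> int:
    """Fold the conv length formula, floor((L - kernel) / stride) + 1, over all layers."""
    length = input_length
    for kernel, stride in zip(CONV_KERNEL, CONV_STRIDE):
        length = (length - kernel) // stride + 1
    return length


if __name__ == "__main__":
    # 6 s of 16 kHz audio = 96_000 samples -> 299 frames, matching the
    # (4, 299, 32) logits shape asserted in `test_mask_feature_prob_ctc`.
    print(feat_extract_output_length(6 * 16_000))  # 299
```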
From 0b6e4baeebf050fe989176ebb6c01de11f935f9f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 13:43:25 +0000 Subject: [PATCH 043/121] add inputs to logits to data2vec' --- .../models/data2vec/configuration_data2vec_audio.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index b2d1856a63b5e..3e8176a119003 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +import math logger = logging.get_logger(__name__) @@ -318,3 +319,7 @@ def __init__( self.tdnn_kernel = list(tdnn_kernel) self.tdnn_dilation = list(tdnn_dilation) self.xvector_output_dim = xvector_output_dim + + @property + def inputs_to_logits_ratio(self): + return math.prod(self.conv_stride) From d3e6d27f2b44612e54528a82c6898c46c7076b27 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 14:28:50 +0000 Subject: [PATCH 044/121] correct autio models --- .../models/auto/configuration_auto.py | 8 +- src/transformers/models/auto/modeling_auto.py | 30 +- .../data2vec/configuration_data2vec_audio.py | 9 +- .../data2vec/configuration_data2vec_text.py | 2 +- .../models/data2vec/data2vec_audio.py | 4 +- .../data2vec/modeling_data2vec_audio.py | 5 +- .../models/data2vec/modeling_data2vec_text.py | 2 +- tests/test_modeling_common.py | 1 + tests/test_modeling_data2vec.py | 1520 +++++++++++++++++ tests/test_modeling_data2vec_audio.py | 264 +-- tests/test_modeling_data2vec_text.py | 1 - 11 files changed, 1552 insertions(+), 294 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index a63fc53f497ef..fa96166f5b079 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -122,6 +122,8 @@ ("unispeech-sat", "UniSpeechSatConfig"), ("unispeech", "UniSpeechConfig"), ("wavlm", "WavLMConfig"), + ("data2vec-audio", "Data2VecAudioConfig"), + ("data2vec-text", "Data2VecTextConfig"), ] ) @@ -324,10 +326,14 @@ ("xlsr_wav2vec2", "XLSR-Wav2Vec2"), ("mluke", "mLUKE"), ("layoutxlm", "LayoutXLM"), + ("data2vec-audio", "Data2VecAudio"), + ("data2vec-text", "Data2VecText"), ] ) -SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([("openai-gpt", "openai")]) +SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict( + [("openai-gpt", "openai"), ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec")] +) def model_type_to_module_name(key): diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4ebbe0f9bd4f4..25b49b309799a 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -83,7 +83,8 @@ ("bart", "BartModel"), ("longformer", "LongformerModel"), ("roberta", "RobertaModel"), - ("data2vec", "Data2VecTextModel"), + ("data2vec-text", "Data2VecTextModel"), + ("data2vec-audio", "Data2VecAudioModel"), ("layoutlm", "LayoutLMModel"), ("squeezebert", "SqueezeBertModel"), ("bert", "BertModel"), @@ -134,7 +135,7 @@ ("fsmt", "FSMTForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecTextForMaskedLM"), + ("data2vec-text", "Data2VecTextForMaskedLM"), ("squeezebert", 
"SqueezeBertForMaskedLM"), ("bert", "BertForPreTraining"), ("big_bird", "BigBirdForPreTraining"), @@ -193,7 +194,7 @@ ("bart", "BartForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecTextForMaskedLM"), + ("data2vec-text", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForMaskedLM"), ("openai-gpt", "OpenAIGPTLMHeadModel"), @@ -234,7 +235,6 @@ ("xlm-roberta-xl", "XLMRobertaXLForCausalLM"), ("xlm-roberta", "XLMRobertaForCausalLM"), ("roberta", "RobertaForCausalLM"), - ("data2vec", "Data2VecForCausalLM"), ("bert", "BertLMHeadModel"), ("openai-gpt", "OpenAIGPTLMHeadModel"), ("gpt2", "GPT2LMHeadModel"), @@ -255,7 +255,7 @@ ("blenderbot-small", "BlenderbotSmallForCausalLM"), ("megatron-bert", "MegatronBertForCausalLM"), ("speech_to_text_2", "Speech2Text2ForCausalLM"), - ("data2vec", "Data2VecTextForCausalLM"), + ("data2vec-text", "Data2VecTextForCausalLM"), ] ) @@ -342,7 +342,7 @@ ("xlm-roberta", "XLMRobertaForMaskedLM"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecTextForMaskedLM"), + ("data2vec-text", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForMaskedLM"), ("megatron-bert", "MegatronBertForMaskedLM"), @@ -423,7 +423,7 @@ ("bart", "BartForSequenceClassification"), ("longformer", "LongformerForSequenceClassification"), ("roberta", "RobertaForSequenceClassification"), - ("data2vec", "Data2VecTextForSequenceClassification"), + ("data2vec-text", "Data2VecTextForSequenceClassification"), ("squeezebert", "SqueezeBertForSequenceClassification"), ("layoutlm", "LayoutLMForSequenceClassification"), ("bert", "BertForSequenceClassification"), @@ -473,7 +473,6 @@ ("xlm-roberta-xl", "XLMRobertaXLForQuestionAnswering"), ("xlm-roberta", "XLMRobertaForQuestionAnswering"), ("roberta", "RobertaForQuestionAnswering"), - ("data2vec", "Data2VecForQuestionAnswering"), ("squeezebert", "SqueezeBertForQuestionAnswering"), ("bert", "BertForQuestionAnswering"), ("xlnet", "XLNetForQuestionAnsweringSimple"), @@ -490,7 +489,7 @@ ("deberta-v2", "DebertaV2ForQuestionAnswering"), ("ibert", "IBertForQuestionAnswering"), ("splinter", "SplinterForQuestionAnswering"), - ("data2vec", "Data2VecTextForQuestionAnswering"), + ("data2vec-text", "Data2VecTextForQuestionAnswering"), ] ) @@ -523,7 +522,6 @@ ("xlm-roberta", "XLMRobertaForTokenClassification"), ("longformer", "LongformerForTokenClassification"), ("roberta", "RobertaForTokenClassification"), - ("data2vec", "Data2VecForTokenClassification"), ("squeezebert", "SqueezeBertForTokenClassification"), ("bert", "BertForTokenClassification"), ("megatron-bert", "MegatronBertForTokenClassification"), @@ -537,7 +535,7 @@ ("deberta-v2", "DebertaV2ForTokenClassification"), ("gpt2", "GPT2ForTokenClassification"), ("ibert", "IBertForTokenClassification"), - ("data2vec", "Data2VecTextForTokenClassification"), + ("data2vec-text", "Data2VecTextForTokenClassification"), ] ) @@ -559,7 +557,7 @@ ("xlm-roberta", "XLMRobertaForMultipleChoice"), ("longformer", "LongformerForMultipleChoice"), ("roberta", "RobertaForMultipleChoice"), - ("data2vec", "Data2VecTextForMultipleChoice"), + ("data2vec-text", "Data2VecTextForMultipleChoice"), ("squeezebert", "SqueezeBertForMultipleChoice"), ("bert", "BertForMultipleChoice"), ("distilbert", "DistilBertForMultipleChoice"), @@ -595,7 +593,7 @@ ("sew", "SEWForSequenceClassification"), ("sew-d", "SEWDForSequenceClassification"), ("wavlm", 
"WavLMForSequenceClassification"), - ("data2vec", "Data2VecAudioForSequenceClassification"), + ("data2vec-audio", "Data2VecAudioForSequenceClassification"), ] ) @@ -609,7 +607,7 @@ ("sew", "SEWForCTC"), ("sew-d", "SEWDForCTC"), ("wavlm", "WavLMForCTC"), - ("data2vec", "Data2VecAudioForCTC"), + ("data2vec-audio", "Data2VecAudioForCTC"), ] ) @@ -619,7 +617,7 @@ ("wav2vec2", "Wav2Vec2ForAudioFrameClassification"), ("unispeech-sat", "UniSpeechSatForAudioFrameClassification"), ("wavlm", "WavLMForAudioFrameClassification"), - ("data2vec", "Data2VecAudioForAudioFrameClassification"), + ("data2vec-audio", "Data2VecAudioForAudioFrameClassification"), ] ) @@ -629,7 +627,7 @@ ("wav2vec2", "Wav2Vec2ForXVector"), ("unispeech-sat", "UniSpeechSatForXVector"), ("wavlm", "WavLMForXVector"), - ("data2vec", "Data2VecAudioForXVector"), + ("data2vec-audio", "Data2VecAudioForXVector"), ] ) diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index 5de57bb2edbaa..352ebe23ab5cd 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -14,9 +14,10 @@ # limitations under the License. """ Data2VecText configuration""" +import math + from ...configuration_utils import PretrainedConfig from ...utils import logging -import math logger = logging.get_logger(__name__) @@ -65,10 +66,6 @@ class Data2VecAudioConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group - normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D - convolutional layers. feat_proj_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for output of the feature encoder. 
feat_extract_activation (`str, `optional`, defaults to `"gelu"`): @@ -203,7 +200,6 @@ def __init__( layerdrop=0.1, initializer_range=0.02, layer_norm_eps=1e-5, - feat_extract_norm="group", feat_extract_activation="gelu", conv_dim=(512, 512, 512, 512, 512, 512, 512), conv_stride=(5, 2, 2, 2, 2, 2, 2), @@ -245,7 +241,6 @@ def __init__( ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm self.feat_extract_activation = feat_extract_activation self.conv_dim = list(conv_dim) self.conv_stride = list(conv_stride) diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py index 2389afd215840..3c7d6765dd314 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -93,7 +93,7 @@ class Data2VecTextConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "data2vec_text" + model_type = "data2vec-text" def __init__( self, diff --git a/src/transformers/models/data2vec/data2vec_audio.py b/src/transformers/models/data2vec/data2vec_audio.py index a5d91b82186f1..1c290d209d31e 100644 --- a/src/transformers/models/data2vec/data2vec_audio.py +++ b/src/transformers/models/data2vec/data2vec_audio.py @@ -452,13 +452,13 @@ def compute_var(y): if dist.is_initialized(): zc = torch.tensor(y.size(0)).cuda() zs = y.sum(dim=0) - zss = (y ** 2).sum(dim=0) + zss = (y**2).sum(dim=0) dist.all_reduce(zc) dist.all_reduce(zs) dist.all_reduce(zss) - var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1)) + var = zss / (zc - 1) - (zs**2) / (zc * (zc - 1)) return torch.sqrt(var + 1e-6).mean() else: return torch.sqrt(y.var(dim=0) + 1e-6).mean() diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index aad099fa7dce9..78e071a0064ef 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -33,7 +33,8 @@ add_start_docstrings_to_model_forward, ) from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput -from ...modeling_utils import PreTrainedModel, torch_int_div +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import torch_int_div from ...utils import logging from .configuration_data2vec_audio import Data2VecAudioConfig @@ -455,7 +456,7 @@ def __init__( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." 
) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 4db194150bdb6..720e41481ffd4 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -66,7 +66,7 @@ _CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" _CTC_EXPECTED_LOSS = 53.48 -DATA2VECTEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [ +DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "data2vec", # See all data2vec models at https://huggingface.co/models?filter=data2vec ] diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 17888bcfac380..21ee53a824b0d 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1911,6 +1911,7 @@ def test_load_with_mismatched_shapes(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: + print(model_class) if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): continue diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 03d97f222fd8d..15ceb29ff9e0f 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -12,3 +12,1523 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" Testing suite for the PyTorch Data2VecAudio model. """ + +import math +import unittest + +import numpy as np +from datasets import load_dataset + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import Data2VecAudioConfig, Data2VecTextConfig, is_torch_available +from transformers.testing_utils import ( + TestCasePlus, + is_pt_flax_cross_test, + require_soundfile, + require_torch, + slow, + torch_device, +) + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, _config_zero_init + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextModel, + Wav2Vec2FeatureExtractor, + Wav2Vec2Processor, + ) + from transformers.models.data2vec.modeling_data2vec_text import ( + DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecTextForTextEmbeddings, + create_position_ids_from_input_ids, + ) + from transformers.models.wav2vec2.modeling_wav2vec2 import ( + Wav2Vec2GumbelVectorQuantizer, + _compute_mask_indices, + _sample_negative_indices, + ) + + +class Data2VecTextModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + 
self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return Data2VecTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = Data2VecTextModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + 
encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = Data2VecTextForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = Data2VecTextForCausalLM(config=config).to(torch_device).eval() + + # make sure that ids don't start with pad token + mask = input_ids.ne(config.pad_token_id).long() + input_ids = input_ids * mask + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next tokens and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + + # make sure that ids don't start with pad token + mask = next_tokens.ne(config.pad_token_id).long() + next_tokens = next_tokens * mask + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and attention_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def 
create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = Data2VecTextForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = Data2VecTextForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class Data2VecTextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextModel, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = Data2VecTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = Data2VecTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_create_position_ids_respects_padding_index(self): + """Ensure that the default position ids only assign a sequential position id to non-padding tokens. This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = Data2VecTextForTextEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) + + position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """Ensure that the default position ids only assign a sequential position id to non-padding tokens. This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. 
Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = Data2VecTextForTextEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + +@require_torch +class Data2VecTextModelIntegrationTest(TestCasePlus): + @slow + def test_inference_masked_lm(self): + model = Data2VecTextForMaskedLM.from_pretrained("data2vec") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] + ) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') + # data2vec.eval() + # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_no_head(self): + model = Data2VecTextModel.from_pretrained("data2vec") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + # compare the actual values for a slice. 
+ expected_slice = torch.tensor( + [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] + ) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') + # data2vec.eval() + # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_classification_head(self): + model = Data2VecTextForSequenceClassification.from_pretrained("data2vec-large-mnli") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 3)) + self.assertEqual(output.shape, expected_shape) + expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli') + # data2vec.eval() + # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() + + self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) + + +class Data2VecAudioModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, # speech is longer + is_training=False, + hidden_size=16, + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=4, + num_attention_heads=2, + hidden_dropout_prob=0.1, + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + mask_time_prob=0.5, + mask_time_length=2, + vocab_size=32, + num_adapter_layers=1, + adapter_stride=2, + tdnn_dim=(32, 32), + tdnn_kernel=(5, 3), + tdnn_dilation=(1, 2), + xvector_output_dim=32, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.num_adapter_layers = num_adapter_layers + self.adapter_stride = adapter_stride + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.scope = scope + self.tdnn_dim = tdnn_dim + self.tdnn_kernel = tdnn_kernel + self.tdnn_dilation = tdnn_dilation + self.xvector_output_dim = xvector_output_dim + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = 
random_attention_mask([self.batch_size, self.seq_length]) + + config = self.get_config() + + return config, input_values, attention_mask + + def get_config(self): + return Data2VecAudioConfig( + hidden_size=self.hidden_size, + feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + mask_time_prob=self.mask_time_prob, + mask_time_length=self.mask_time_length, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + layer_norm_eps=self.layer_norm_eps, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + vocab_size=self.vocab_size, + num_adapter_layers=self.num_adapter_layers, + adapter_stride=self.adapter_stride, + tdnn_dim=self.tdnn_dim, + tdnn_kernel=self.tdnn_kernel, + tdnn_dilation=self.tdnn_dilation, + xvector_output_dim=self.xvector_output_dim, + ) + + def create_and_check_model(self, config, input_values, attention_mask): + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter(self, config, input_values, attention_mask): + config.add_adapter = True + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): + config.add_adapter = True + config.output_hidden_size = 8 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), + ) + + def create_and_check_batch_inference(self, config, input_values, *args): + # test does not pass for models making use of `group_norm` + # check: https://github.com/pytorch/fairseq/issues/3227 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0.0 + + batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state + + for i in range(input_values.shape[0]): + input_slice = input_values[i : i + 1, : input_lengths[i]] + output = model(input_slice).last_hidden_state + + batch_output = batch_outputs[i : i + 1, : output.shape[1]] + self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) + + def check_ctc_loss(self, config, input_values, *args): + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + + # make sure that dropout is 
disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + model.config.ctc_loss_reduction = "sum" + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + model.config.ctc_loss_reduction = "mean" + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(sum_loss, float)) + self.parent.assertTrue(isinstance(mean_loss, float)) + + def check_seq_classifier_loss(self, config, input_values, *args): + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + unmasked_loss = model(input_values, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(masked_loss, float)) + self.parent.assertTrue(isinstance(unmasked_loss, float)) + self.parent.assertTrue(masked_loss != unmasked_loss) + + def check_ctc_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + model.train() + + # freeze feature encoder + model.freeze_feature_encoder() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + if max_length_labels[i] < labels.shape[-1]: + # it's important that we make sure that target lengths are at least + # one shorter than logit lengths to prevent -inf + labels[i, max_length_labels[i] - 1 :] = -100 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_seq_classifier_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + 
loss.backward() + + def check_xvector_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForXVector(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_labels_out_of_vocab(self, config, input_values, *args): + model = Data2VecAudioForCTC(config) + model.to(torch_device) + model.train() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) + + with self.parent.assertRaises(ValueError): + model(input_values, labels=labels) + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (Data2VecAudioForCTC, Data2VecAudioModel, Data2VecAudioForSequenceClassification) + if is_torch_available() + else () + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Data2VecAudioModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_adapter(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) + + def test_model_with_adapter_proj_dim(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_xvector_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_xvector_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + # Data2VecAudio has no inputs_embeds + def test_inputs_embeds(self): + pass + + # `input_ids` is renamed to `input_values` + def test_forward_signature(self): + pass + + # Data2VecAudio cannot resize token embeddings + # since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # Data2VecAudio has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_flax_to_pt(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_pt_to_flax(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + "project_hid.weight", + "project_hid.bias", + "project_q.weight", + "project_q.bias", + "feature_projection.projection.weight", + "feature_projection.projection.bias", + "objective.weight", + ] + if param.requires_grad: + if any([x in name for x in uniform_init_parms]): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + 
module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + def test_mask_feature_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + def test_mask_time_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") + self.assertIsNotNone(model) + + +@require_torch +class Data2VecAudioUtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) + + def test_compute_mask_indices_low_prob(self): + # with these settings num_masked_spans=0.5, which means probabilistic rounding + # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in + # the other 5 out of 10, cases num_masked_spans=1 + n_trials = 100 + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + count_dimensions_masked = 0 + count_dimensions_not_masked = 0 + + for _ in range(n_trials): + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + num_masks = torch.sum(mask).item() + + if num_masks > 0: + count_dimensions_masked += 1 + else: + count_dimensions_not_masked += 1 + + # as we test for at least 10 masked dimension and at least + # 10 non-masked dimension, this test could fail with probability: + # P(100 coin flips, at most 9 heads) = 1.66e-18 + self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) + 
self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + # because of overlap, the masks don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + def test_compute_mask_indices_attn_mask_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + attention_mask[:2, sequence_length // 2 :] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask + ) + mask = torch.from_numpy(mask).to(torch_device) + + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) + + def test_compute_mask_indices_short_audio(self): + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + # force one example to be heavily padded + attention_mask[0, 5:] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 + ) + + # make sure that non-padded examples cannot be padded + self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) + + def test_compute_perplexity(self): + probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100 + + ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) + self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) + + # mask half of the input + mask = torch.ones((2,), device=torch_device, dtype=torch.bool) + mask[0] = 0 + + ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) + self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3) + + def test_sample_negatives(self): + batch_size = 2 + sequence_length = 10 + hidden_size = 4 + num_negatives = 3 + + features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( + sequence_length, hidden_size + ) # each value in the vector consists of the same value + features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() + + # sample negative indices + sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) + sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) + negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] + negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) + self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) + + # make sure no negatively sampled vector is actually a positive one + for negative in negatives: + self.assertTrue(((negative - features) == 0).sum() == 0.0) + + # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim + self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) + + def 
test_sample_negatives_with_mask(self): + batch_size = 2 + sequence_length = 10 + hidden_size = 4 + num_negatives = 3 + + # second half of last input tensor is padded + mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + mask[-1, sequence_length // 2 :] = 0 + + features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( + sequence_length, hidden_size + ) # each value in the vector consists of the same value + features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() + + # replace masked feature vectors with -100 to test that those are not sampled + features = torch.where(mask[:, :, None].expand(features.shape).bool(), features, -100) + + # sample negative indices + sampled_negative_indices = _sample_negative_indices( + (batch_size, sequence_length), num_negatives, mask.cpu().numpy() + ) + sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) + negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] + negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) + + self.assertTrue((negatives >= 0).all().item()) + + self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) + + # make sure no negatively sampled vector is actually a positive one + for negative in negatives: + self.assertTrue(((negative - features) == 0).sum() == 0.0) + + # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim + self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) + + +@require_torch +@require_soundfile +@slow +class Data2VecAudioModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def _load_superb(self, task, num_samples): + ds = load_dataset("anton-l/superb_dummy", task, split="test") + + return ds[:num_samples] + + def test_inference_ctc_normal(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + input_speech = self._load_datasamples(1) + + input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_normal_batched(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(2) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + + with torch.no_grad(): + logits = 
model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_robust_batched(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + "his instant panic was followed by a small sharp blow high on his chest", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_keyword_spotting(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") + input_data = self._load_superb("ks", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) + + expected_labels = [7, 6, 10, 9] + # s3prl logits for the same batch + expected_logits = torch.tensor([6.1186, 11.8961, 10.2931, 6.0898], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_inference_intent_classification(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("ic", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + + predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1) + predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1) + predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1) + + expected_labels_action = [0, 0, 2, 3] + expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device) + 
expected_labels_object = [3, 10, 3, 4] + expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device) + expected_labels_location = [0, 0, 0, 1] + expected_logits_location = torch.tensor([1.5335, 6.5096, 10.5704, 11.0569], device=torch_device) + + self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) + self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) + self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) + + self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) + self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) + self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) + + def test_inference_speaker_identification(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("si", 4) + + output_logits = [] + with torch.no_grad(): + for example in input_data["speech"]: + input = processor(example, return_tensors="pt", padding=True) + output = model(input.input_values.to(torch_device), attention_mask=None) + output_logits.append(output.logits[0]) + output_logits = torch.stack(output_logits) + predicted_logits, predicted_ids = torch.max(output_logits, dim=-1) + + expected_labels = [251, 1, 1, 3] + # s3prl logits for the same batch + expected_logits = torch.tensor([37.5627, 71.6362, 64.2419, 31.7778], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_inference_emotion_recognition(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("er", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) + + expected_labels = [1, 1, 2, 2] + # s3prl logits for the same batch + expected_logits = torch.tensor([2.1722, 3.0779, 8.0287, 6.6797], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_phoneme_recognition(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-ic") + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t", + "s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b 
ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ oʊ n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ", + "ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː v ə n ð ə s ɔːɹ ɹ ɪ ŋ ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t", + "h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t", + ] + # should correspond to =>: + # [ + # "a man said to the universe sir i exist", + # "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + # "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + # "his instant panic was followed by a small sharp blow high on his chest", + # ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_diarization(self): + model = Data2VecAudioForAudioFrameClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("sd", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + # labels is a one-hot array of shape (num_frames, num_speakers) + labels = (outputs.logits > 0).long() + + # s3prl logits for the same batch + expected_logits = torch.tensor( + [ + [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], + [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], + [[-0.8656, -0.4783], [-0.8899, -0.3289], [-0.9267, -0.5781], [-0.7817, -0.4619]], + [[-4.8625, -2.5316], [-5.2339, -2.2155], [-4.9835, -2.0344], [-4.4727, -1.8421]], + ], + device=torch_device, + ) + self.assertEqual(labels[0, :, 0].sum(), 555) + self.assertEqual(labels[0, :, 1].sum(), 299) + # TODO: update the tolerance after the CI moves to torch 1.10 + self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) + + def test_inference_speaker_verification(self): + model = Data2VecAudioForXVector.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("si", 4) + + inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) + labels = torch.tensor([5, 1, 1, 3], device=torch_device).T + + with torch.no_grad(): + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + outputs = model(input_values, attention_mask=attention_mask, labels=labels) + embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1).cpu() + + cosine_sim = torch.nn.CosineSimilarity(dim=-1) + # id10002 vs id10002 + self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) + # id10006 vs id10002 + self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).numpy(), 0.7579, 3) + # id10002 vs id10004 + self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).numpy(), 0.7594, 3) + + # TODO: update 
the tolerance after the CI moves to torch 1.10 + self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) diff --git a/tests/test_modeling_data2vec_audio.py b/tests/test_modeling_data2vec_audio.py index 8a39f3d11e789..cf2eade935012 100644 --- a/tests/test_modeling_data2vec_audio.py +++ b/tests/test_modeling_data2vec_audio.py @@ -55,7 +55,6 @@ def __init__( seq_length=1024, # speech is longer is_training=False, hidden_size=16, - feat_extract_norm="group", feat_extract_dropout=0.0, feat_extract_activation="gelu", conv_dim=(32, 32, 32), @@ -66,7 +65,7 @@ def __init__( num_conv_pos_embedding_groups=2, num_hidden_layers=4, num_attention_heads=2, - hidden_dropout_prob=0.1, # this is most likely not correctly set yet + hidden_dropout_prob=0.1, intermediate_size=20, layer_norm_eps=1e-5, hidden_act="gelu", @@ -74,7 +73,6 @@ def __init__( mask_time_prob=0.5, mask_time_length=2, vocab_size=32, - do_stable_layer_norm=False, num_adapter_layers=1, adapter_stride=2, tdnn_dim=(32, 32), @@ -88,7 +86,6 @@ def __init__( self.seq_length = seq_length self.is_training = is_training self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm self.feat_extract_dropout = feat_extract_dropout self.feat_extract_activation = feat_extract_activation self.conv_dim = conv_dim @@ -105,7 +102,6 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm self.num_adapter_layers = num_adapter_layers self.adapter_stride = adapter_stride self.mask_time_prob = mask_time_prob @@ -135,7 +131,6 @@ def prepare_config_and_inputs(self): def get_config(self): return Data2VecAudioConfig( hidden_size=self.hidden_size, - feat_extract_norm=self.feat_extract_norm, feat_extract_dropout=self.feat_extract_dropout, feat_extract_activation=self.feat_extract_activation, conv_dim=self.conv_dim, @@ -151,7 +146,6 @@ def get_config(self): hidden_dropout_prob=self.hidden_dropout_prob, intermediate_size=self.intermediate_size, layer_norm_eps=self.layer_norm_eps, - do_stable_layer_norm=self.do_stable_layer_norm, hidden_act=self.hidden_act, initializer_range=self.initializer_range, vocab_size=self.vocab_size, @@ -595,262 +589,6 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) -@require_torch -class Data2VecAudioRobustModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Data2VecAudioForCTC, - Data2VecAudioModel, - Data2VecAudioForSequenceClassification, - Data2VecAudioForAudioFrameClassification, - Data2VecAudioForXVector, - ) - if is_torch_available() - else () - ) - test_pruning = False - test_headmasking = False - test_torchscript = False - - def setUp(self): - self.model_tester = Data2VecAudioModelTester( - self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True - ) - self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - 
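# A minimal worked example of the length bookkeeping behind the adapter/shape checks
# above. The recurrence mirrors Data2VecAudioModelTester (seq_length=1024,
# conv_kernel=(8, 8, 8), conv_stride=(4, 4, 4), adapter_stride=2, one adapter layer);
# the helper name and the asserted numbers below are illustrative, not part of the suite.
import math

def conv_output_length(input_length, kernels=(8, 8, 8), strides=(4, 4, 4)):
    # each conv layer trims (kernel - 1) positions, then downsamples by its stride
    length = float(input_length)
    for kernel, stride in zip(kernels, strides):
        length = (length - (kernel - 1)) / stride
    return int(math.ceil(length))

assert conv_output_length(1024) == 14                     # default tester geometry
assert conv_output_length(1024, strides=(3, 3, 3)) == 35  # the robust tester's conv_stride override
assert (14 - 1) // 2 + 1 == 7                             # adapter_output_seq_length with stride 2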
def test_batched_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_batch_inference(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # Data2VecAudio has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to `input_values` - def test_forward_signature(self): - pass - - # Data2VecAudio cannot resize token embeddings - # since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # Data2VecAudio has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented - def test_model_common_attributes(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - model.to(torch_device) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - - input_lengths = torch.tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - 
"objective.weight", - ] - if param.requires_grad: - if any([x in name for x in uniform_init_parms]): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def test_mask_feature_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - def test_mask_time_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - def test_mask_time_feature_prob_ctc_single_batch(self): - model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", - mask_time_prob=0.2, - mask_feature_prob=0.2, - mask_time_length=2, - mask_feature_length=2, - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - 
self.assertEqual(logits.shape, (1, 1498, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") - self.assertIsNotNone(model) - - @require_torch class Data2VecAudioUtilsTest(unittest.TestCase): def test_compute_mask_indices(self): diff --git a/tests/test_modeling_data2vec_text.py b/tests/test_modeling_data2vec_text.py index ffdc88fbd8f74..203ad1609e6b0 100644 --- a/tests/test_modeling_data2vec_text.py +++ b/tests/test_modeling_data2vec_text.py @@ -339,7 +339,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( Data2VecTextForCausalLM, From b55f326357028905ba9f33a2bc5ca5082f2b3d24 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 14:53:20 +0000 Subject: [PATCH 045/121] correct config auto --- src/transformers/models/auto/configuration_auto.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index fa96166f5b079..5c42cad65f2d2 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -91,7 +91,6 @@ ("reformer", "ReformerConfig"), ("longformer", "LongformerConfig"), ("roberta", "RobertaConfig"), - ("data2vec", "Data2VecTextConfig"), ("deberta-v2", "DebertaV2Config"), ("deberta", "DebertaConfig"), ("flaubert", "FlaubertConfig"), @@ -180,7 +179,8 @@ ("xlnet", "XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("data2vec", "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("data2vec-text", "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("data2vec-audio", "DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -274,7 +274,6 @@ ("reformer", "Reformer"), ("longformer", "Longformer"), ("roberta", "RoBERTa"), - ("data2vec", "Data2Vec"), ("flaubert", "FlauBERT"), ("fsmt", "FairSeq Machine-Translation"), ("squeezebert", "SqueezeBERT"), From 4ff05bb2e67595ebaf20ba3a1c5170119cf131b6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 14:54:20 +0000 Subject: [PATCH 046/121] correct tok auto --- src/transformers/models/auto/tokenization_auto.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index cdf84c0aad532..0c953f1636bf5 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -108,7 +108,6 @@ ("bart", ("BartTokenizer", "BartTokenizerFast")), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), - ("data2vec", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ( "reformer", ( From f216196231d818290c567dfc58b6045c67994419 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 15:55:57 +0100 Subject: [PATCH 047/121] Update 
utils/tests_fetcher.py --- utils/tests_fetcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index dd36390c84442..e842055d93bbd 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -345,7 +345,6 @@ def module_to_test_file(module_fname): "tests/sagemaker/test_single_node_gpu.py", # SageMaker test "tests/sagemaker/test_multi_node_model_parallel.py", # SageMaker test "tests/sagemaker/test_multi_node_data_parallel.py", # SageMaker test - "tests/test_modeling_data2vec.py", # TODO:Remove redundant file ] From c98558e2400947f50b7111b19c0c56703ce8fc36 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 14:57:32 +0000 Subject: [PATCH 048/121] delete unnecessary files --- .../models/data2vec/configuration_data2vec.py | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 src/transformers/models/data2vec/configuration_data2vec.py diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py deleted file mode 100644 index b0b4464ad1170..0000000000000 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ /dev/null @@ -1,14 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. From 2d260f23cfac0e4d5b1abea3ac16748f79dbb79f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 14:57:39 +0000 Subject: [PATCH 049/121] delete unnecessary files --- tests/test_modeling_data2vec_audio.py | 1025 ------------------------- tests/test_modeling_data2vec_text.py | 533 ------------- 2 files changed, 1558 deletions(-) delete mode 100644 tests/test_modeling_data2vec_audio.py delete mode 100644 tests/test_modeling_data2vec_text.py diff --git a/tests/test_modeling_data2vec_audio.py b/tests/test_modeling_data2vec_audio.py deleted file mode 100644 index cf2eade935012..0000000000000 --- a/tests/test_modeling_data2vec_audio.py +++ /dev/null @@ -1,1025 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Data2VecAudio model. 
""" - -import math -import unittest - -import numpy as np -from datasets import load_dataset - -from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask -from transformers import Data2VecAudioConfig, is_torch_available -from transformers.testing_utils import is_pt_flax_cross_test, require_soundfile, require_torch, slow, torch_device - -from .test_configuration_common import ConfigTester -from .test_modeling_common import ModelTesterMixin, _config_zero_init - - -if is_torch_available(): - import torch - - from transformers import ( - Data2VecAudioForAudioFrameClassification, - Data2VecAudioForCTC, - Data2VecAudioForSequenceClassification, - Data2VecAudioForXVector, - Data2VecAudioModel, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - ) - from transformers.models.wav2vec2.modeling_wav2vec2 import ( - Wav2Vec2GumbelVectorQuantizer, - _compute_mask_indices, - _sample_negative_indices, - ) - - -class Data2VecAudioModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=4, - num_attention_heads=2, - hidden_dropout_prob=0.1, - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - mask_time_prob=0.5, - mask_time_length=2, - vocab_size=32, - num_adapter_layers=1, - adapter_stride=2, - tdnn_dim=(32, 32), - tdnn_kernel=(5, 3), - tdnn_dilation=(1, 2), - xvector_output_dim=32, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.num_adapter_layers = num_adapter_layers - self.adapter_stride = adapter_stride - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.scope = scope - self.tdnn_dim = tdnn_dim - self.tdnn_kernel = tdnn_kernel - self.tdnn_dilation = tdnn_dilation - self.xvector_output_dim = xvector_output_dim - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): 
- return Data2VecAudioConfig( - hidden_size=self.hidden_size, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - mask_time_prob=self.mask_time_prob, - mask_time_length=self.mask_time_length, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - num_adapter_layers=self.num_adapter_layers, - adapter_stride=self.adapter_stride, - tdnn_dim=self.tdnn_dim, - tdnn_kernel=self.tdnn_kernel, - tdnn_dilation=self.tdnn_dilation, - xvector_output_dim=self.xvector_output_dim, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter(self, config, input_values, attention_mask): - config.add_adapter = True - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): - config.add_adapter = True - config.output_hidden_size = 8 - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = Data2VecAudioForCTC(config=config) - model.to(torch_device) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) - - 
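# simulate a ragged batch: the three samples keep 1/4, 1/2 and all of the padded
# length, so the CTC loss is exercised with genuinely different input lengths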
input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = Data2VecAudioForSequenceClassification(config=config) - model.to(torch_device) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForCTC(config=config) - model.to(torch_device) - model.train() - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lenghts are at least - # one shorter than logit lenghts to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForSequenceClassification(config=config) - model.to(torch_device) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_xvector_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = 
Data2VecAudioForXVector(config=config) - model.to(torch_device) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = Data2VecAudioForCTC(config) - model.to(torch_device) - model.train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) - - with self.parent.assertRaises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_torch -class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (Data2VecAudioForCTC, Data2VecAudioModel, Data2VecAudioForSequenceClassification) - if is_torch_available() - else () - ) - test_pruning = False - test_headmasking = False - test_torchscript = False - - def setUp(self): - self.model_tester = Data2VecAudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # Data2VecAudio has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to 
`input_values` - def test_forward_signature(self): - pass - - # Data2VecAudio cannot resize token embeddings - # since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # Data2VecAudio has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented - def test_model_common_attributes(self): - pass - - @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_flax_to_pt(self): - pass - - @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_pt_to_flax(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - model.to(torch_device) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - - input_lengths = torch.tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any([x in name for x in uniform_init_parms]): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and 
module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def test_mask_feature_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - def test_mask_time_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") - self.assertIsNotNone(model) - - -@require_torch -class Data2VecAudioUtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) - - def test_compute_mask_indices_low_prob(self): - # with these settings num_masked_spans=0.5, which means probabilistic rounding - # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in - # the other 5 out of 10, cases num_masked_spans=1 - n_trials = 100 - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - count_dimensions_masked = 0 - count_dimensions_not_masked = 0 - - for _ in range(n_trials): - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - num_masks = torch.sum(mask).item() - - if num_masks > 0: - count_dimensions_masked += 1 - else: - count_dimensions_not_masked += 1 - - # as we test for at least 10 masked dimension and at least - # 10 non-masked dimension, this test could fail with probability: - # P(100 coin flips, at most 9 heads) = 1.66e-18 - self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) - self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = 
_compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - def test_compute_mask_indices_attn_mask_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) - attention_mask[:2, sequence_length // 2 :] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask - ) - mask = torch.from_numpy(mask).to(torch_device) - - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) - - def test_compute_mask_indices_short_audio(self): - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) - # force one example to be heavily padded - attention_mask[0, 5:] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 - ) - - # make sure that non-padded examples cannot be padded - self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) - - def test_compute_perplexity(self): - probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100 - - ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) - self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) - - # mask half of the input - mask = torch.ones((2,), device=torch_device, dtype=torch.bool) - mask[0] = 0 - - ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) - self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3) - - def test_sample_negatives(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() - - # sample negative indices - sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) - sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - - def test_sample_negatives_with_mask(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - # second half of last input tensor is padded - mask = torch.ones((batch_size, sequence_length), 
dtype=torch.long, device=torch_device) - mask[-1, sequence_length // 2 :] = 0 - - features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() - - # replace masked feature vectors with -100 to test that those are not sampled - features = torch.where(mask[:, :, None].expand(features.shape).bool(), features, -100) - - # sample negative indices - sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask.cpu().numpy() - ) - sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - - self.assertTrue((negatives >= 0).all().item()) - - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - - -@require_torch -@require_soundfile -@slow -class Data2VecAudioModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test") - - return ds[:num_samples] - - def test_inference_ctc_normal(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") - model.to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - input_speech = self._load_datasamples(1) - - input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) - - with torch.no_grad(): - logits = model(input_values).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_normal_batched(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") - model.to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - - with torch.no_grad(): - logits = model(input_values).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body 
trickling into the tight lowing cloth that was the only garment he wore", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_robust_batched(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - - with torch.no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", - "his instant panic was followed by a small sharp blow high on his chest", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_keyword_spotting(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") - input_data = self._load_superb("ks", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) - - expected_labels = [7, 6, 10, 9] - # s3prl logits for the same batch - expected_logits = torch.tensor([6.1186, 11.8961, 10.2931, 6.0898], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_inference_intent_classification(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("ic", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - - predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1) - predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1) - predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1) - - expected_labels_action = [0, 0, 2, 3] - expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device) - expected_labels_object = [3, 10, 3, 4] - expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device) - expected_labels_location = [0, 0, 0, 1] - expected_logits_location = torch.tensor([1.5335, 6.5096, 
10.5704, 11.0569], device=torch_device) - - self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) - self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) - self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) - - self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) - self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) - self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) - - def test_inference_speaker_identification(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("si", 4) - - output_logits = [] - with torch.no_grad(): - for example in input_data["speech"]: - input = processor(example, return_tensors="pt", padding=True) - output = model(input.input_values.to(torch_device), attention_mask=None) - output_logits.append(output.logits[0]) - output_logits = torch.stack(output_logits) - predicted_logits, predicted_ids = torch.max(output_logits, dim=-1) - - expected_labels = [251, 1, 1, 3] - # s3prl logits for the same batch - expected_logits = torch.tensor([37.5627, 71.6362, 64.2419, 31.7778], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_inference_emotion_recognition(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("er", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) - - expected_labels = [1, 1, 2, 2] - # s3prl logits for the same batch - expected_logits = torch.tensor([2.1722, 3.0779, 8.0287, 6.6797], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_phoneme_recognition(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-ic") - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - - with torch.no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t", - "s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ oʊ n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ", - "ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː v ə n ð ə s ɔːɹ ɹ ɪ ŋ 
ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t", - "h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t", - ] - # should correspond to =>: - # [ - # "a man said to the universe sir i exist", - # "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - # "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", - # "his instant panic was followed by a small sharp blow high on his chest", - # ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_diarization(self): - model = Data2VecAudioForAudioFrameClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("sd", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - # labels is a one-hot array of shape (num_frames, num_speakers) - labels = (outputs.logits > 0).long() - - # s3prl logits for the same batch - expected_logits = torch.tensor( - [ - [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], - [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], - [[-0.8656, -0.4783], [-0.8899, -0.3289], [-0.9267, -0.5781], [-0.7817, -0.4619]], - [[-4.8625, -2.5316], [-5.2339, -2.2155], [-4.9835, -2.0344], [-4.4727, -1.8421]], - ], - device=torch_device, - ) - self.assertEqual(labels[0, :, 0].sum(), 555) - self.assertEqual(labels[0, :, 1].sum(), 299) - # TODO: update the tolerance after the CI moves to torch 1.10 - self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) - - def test_inference_speaker_verification(self): - model = Data2VecAudioForXVector.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("si", 4) - - inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) - labels = torch.tensor([5, 1, 1, 3], device=torch_device).T - - with torch.no_grad(): - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - outputs = model(input_values, attention_mask=attention_mask, labels=labels) - embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1).cpu() - - cosine_sim = torch.nn.CosineSimilarity(dim=-1) - # id10002 vs id10002 - self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) - # id10006 vs id10002 - self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).numpy(), 0.7579, 3) - # id10002 vs id10004 - self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).numpy(), 0.7594, 3) - - # TODO: update the tolerance after the CI moves to torch 1.10 - self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) diff --git a/tests/test_modeling_data2vec_text.py b/tests/test_modeling_data2vec_text.py deleted file mode 100644 index 
203ad1609e6b0..0000000000000 --- a/tests/test_modeling_data2vec_text.py +++ /dev/null @@ -1,533 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Data2VecText model. """ - - -import unittest - -from transformers import Data2VecTextConfig, is_torch_available -from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device - -from .test_configuration_common import ConfigTester -from .test_generation_utils import GenerationTesterMixin -from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - - -if is_torch_available(): - import torch - - from transformers import ( - Data2VecTextForCausalLM, - Data2VecTextForMaskedLM, - Data2VecTextForMultipleChoice, - Data2VecTextForQuestionAnswering, - Data2VecTextForSequenceClassification, - Data2VecTextForTokenClassification, - Data2VecTextModel, - ) - from transformers.models.data2vec.modeling_data2vec import ( - DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecTextForTextEmbeddings, - create_position_ids_from_input_ids, - ) - - -class Data2VecModelTester: - def __init__( - self, - parent, - ): - self.parent = parent - self.batch_size = 13 - self.seq_length = 7 - self.is_training = True - self.use_input_mask = True - self.use_token_type_ids = True - self.use_labels = True - self.vocab_size = 99 - self.hidden_size = 32 - self.num_hidden_layers = 5 - self.num_attention_heads = 4 - self.intermediate_size = 37 - self.hidden_act = "gelu" - self.hidden_dropout_prob = 0.1 - self.attention_probs_dropout_prob = 0.1 - self.max_position_embeddings = 512 - self.type_vocab_size = 16 - self.type_sequence_label_size = 2 - self.initializer_range = 0.02 - self.num_labels = 3 - self.num_choices = 4 - self.scope = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return Data2VecTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Data2VecTextModel(config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Data2VecTextForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Data2VecTextForCausalLM(config=config).to(torch_device).eval() - - # make sure that ids don't start with pad token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - 
) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = Data2VecTextForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = Data2VecTextForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextForQuestionAnswering(config=config) - 
model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Data2VecTextForCausalLM, - Data2VecTextForMaskedLM, - Data2VecTextModel, - Data2VecTextForSequenceClassification, - Data2VecTextForTokenClassification, - Data2VecTextForMultipleChoice, - Data2VecTextForQuestionAnswering, - ) - if is_torch_available() - else () - ) - all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else () - - def setUp(self): - self.model_tester = Data2VecModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = Data2VecTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_create_position_ids_respects_padding_index(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = Data2VecTextForTextEmbeddings(config=config) - - input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = torch.as_tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = Data2VecTextForTextEmbeddings(config=config) - - inputs_embeds = torch.empty(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - -@require_torch -class Data2VecModelIntegrationTest(TestCasePlus): - @slow - def test_inference_masked_lm(self): - model = Data2VecTextForMaskedLM.from_pretrained("data2vec") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 11, 50265)) - self.assertEqual(output.shape, expected_shape) - # compare the actual values for a slice. 
- expected_slice = torch.tensor( - [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] - ) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_no_head(self): - model = Data2VecTextModel.from_pretrained("data2vec") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - # compare the actual values for a slice. - expected_slice = torch.tensor( - [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] - ) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach() - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_classification_head(self): - model = Data2VecTextForSequenceClassification.from_pretrained("data2vec-large-mnli") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 3)) - self.assertEqual(output.shape, expected_shape) - expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli') - # data2vec.eval() - # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() - - self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) From fee4f8dfcdccb83213afb8ba263e5a51ed27a544 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 15:10:20 +0000 Subject: [PATCH 050/121] further renaming --- .../data2vec/configuration_data2vec_audio.py | 42 +----------- .../data2vec/configuration_data2vec_text.py | 6 +- .../data2vec/modeling_data2vec_audio.py | 68 ------------------- .../models/data2vec/modeling_data2vec_text.py | 26 +++---- 4 files changed, 19 insertions(+), 123 deletions(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index 352ebe23ab5cd..71d455702e639 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -33,7 +33,7 @@ class Data2VecAudioConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate an Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecAudio - [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture. + [facebook/data2vec-audio-base-960h](https://huggingface.co/facebook/data2vec-audio-base-960h) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -71,8 +71,6 @@ class Data2VecAudioConfig(PretrainedConfig): feat_extract_activation (`str, `optional`, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. 
If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for quantized feature encoder states. conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. @@ -114,22 +112,6 @@ class Data2VecAudioConfig(PretrainedConfig): The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time step, irrespectively of `mask_feature_prob`. Only relevant if ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' - num_codevectors_per_group (`int`, *optional*, defaults to 320): - Number of entries in each quantization codebook (group). - num_codevector_groups (`int`, *optional*, defaults to 2): - Number of codevector groups for product codevector quantization. - contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): - The temperature *kappa* in the contrastive loss. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. - num_negatives (`int`, *optional*, defaults to 100): - Number of negative samples for the contrastive loss. - codevector_dim (`int`, *optional*, defaults to 256): - Dimensionality of the quantized feature vectors. - proj_codevector_dim (`int`, *optional*, defaults to 256): - Dimensionality of the final projection of both the quantized and the transformer features. - diversity_loss_weight (`int`, *optional*, defaults to 0.1): - The weight of the codebook diversity loss component. ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an instance of [`Data2VecAudioForCTC`]. 
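
As a quick illustration of the masking and CTC options documented in the hunk above, here is a minimal sketch of building a CTC model from a hand-written configuration. It only uses parameter names that appear in this file's `__init__` signature after this patch; the concrete values are illustrative placeholders, not recommendations from the patch itself.

```python
from transformers import Data2VecAudioConfig, Data2VecAudioForCTC

# Illustrative values only: enable SpecAugment-style time masking and use a
# mean-reduced, zero-clamped CTC loss, as described in the docstring above.
config = Data2VecAudioConfig(
    mask_time_prob=0.05,
    mask_time_length=10,
    mask_feature_prob=0.0,
    ctc_loss_reduction="mean",
    ctc_zero_infinity=True,
)
model = Data2VecAudioForCTC(config)  # randomly initialized, ready for fine-tuning
```
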
@@ -172,10 +154,10 @@ class Data2VecAudioConfig(PretrainedConfig): ```python >>> from transformers import Data2VecAudioModel, Data2VecAudioConfig - >>> # Initializing a Data2VecAudio facebook/wav2vec2-base-960h style configuration + >>> # Initializing a Data2VecAudio facebook/data2vec-audio-base-960h style configuration >>> configuration = Data2VecAudioConfig() - >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration + >>> # Initializing a model from the facebook/data2vec-audio-base-960h style configuration >>> model = Data2VecAudioModel(configuration) >>> # Accessing the model configuration @@ -195,7 +177,6 @@ def __init__( activation_dropout=0.1, attention_dropout=0.1, feat_proj_dropout=0.0, - feat_quantizer_dropout=0.0, final_dropout=0.1, layerdrop=0.1, initializer_range=0.02, @@ -214,13 +195,6 @@ def __init__( mask_feature_prob=0.0, mask_feature_length=10, mask_feature_min_masks=0, - num_codevectors_per_group=320, - num_codevector_groups=2, - contrastive_logits_temperature=0.1, - num_negatives=100, - codevector_dim=256, - proj_codevector_dim=256, - diversity_loss_weight=0.1, ctc_loss_reduction="sum", ctc_zero_infinity=False, use_weighted_layer_sum=False, @@ -285,16 +259,6 @@ def __init__( self.mask_feature_length = mask_feature_length self.mask_feature_min_masks = mask_feature_min_masks - # parameters for pretraining with codevector quantized representations - self.num_codevectors_per_group = num_codevectors_per_group - self.num_codevector_groups = num_codevector_groups - self.contrastive_logits_temperature = contrastive_logits_temperature - self.feat_quantizer_dropout = feat_quantizer_dropout - self.num_negatives = num_negatives - self.codevector_dim = codevector_dim - self.proj_codevector_dim = proj_codevector_dim - self.diversity_loss_weight = diversity_loss_weight - # ctc loss self.ctc_loss_reduction = ctc_loss_reduction self.ctc_zero_infinity = ctc_zero_infinity diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py index 3c7d6765dd314..8cf165ced3db2 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -41,8 +41,8 @@ class Data2VecTextConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 30522): - Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + Vocabulary size of the DATA2VEC model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`Data2VecModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -62,7 +62,7 @@ class Data2VecTextConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + The vocabulary size of the `token_type_ids` passed when calling [`Data2VecModel`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 78e071a0064ef..b66430328aa66 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -260,42 +260,6 @@ def compute_num_masked_span(input_length): return spec_aug_mask -def _sample_negative_indices( - features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None -): - """ - Sample `num_negatives` vectors from feature vectors. - """ - batch_size, sequence_length = features_shape - - # generate indices of the positive vectors themselves, repeat them `num_negatives` times - sequence_length_range = np.arange(sequence_length) - - # get `num_negatives` random vector indices from the same utterance - sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) - - mask_time_indices = ( - mask_time_indices.astype(np.bool) if mask_time_indices is not None else np.ones(features_shape, dtype=np.bool) - ) - - for batch_idx in range(batch_size): - high = mask_time_indices[batch_idx].sum() - 1 - mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]] - - feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives)) - sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives)) - # avoid sampling the same positive vector, but keep the distribution uniform - sampled_indices[sampled_indices >= feature_indices] += 1 - - # remap to actual indices - sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices] - - # correct for batch size - sampled_negative_indices[batch_idx] += batch_idx * sequence_length - - return sampled_negative_indices - - class Data2VecAudioConvLayer(nn.Module): def __init__(self, config, layer_id=0): super().__init__() @@ -636,38 +600,6 @@ def forward(self, hidden_states, attention_mask=None, output_attentions=False): return outputs -class Data2VecAudioEncoderLayerStableLayerNorm(nn.Module): - def __init__(self, config): - super().__init__() - self.attention = Data2VecAudioAttention( - embed_dim=config.hidden_size, - num_heads=config.num_attention_heads, - dropout=config.attention_dropout, - is_decoder=False, - ) - self.dropout = nn.Dropout(config.hidden_dropout) - self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.feed_forward = Data2VecAudioFeedForward(config) - self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states, attention_mask=None, output_attentions=False): - attn_residual = hidden_states - hidden_states = self.layer_norm(hidden_states) - hidden_states, attn_weights, _ = self.attention( - hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) - hidden_states = self.dropout(hidden_states) - hidden_states = attn_residual + hidden_states - hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - class Data2VecAudioEncoder(nn.Module): def __init__(self, config): super().__init__() diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 720e41481ffd4..2aa6db6de3fba 100644 --- 
a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -593,7 +593,7 @@ class Data2VecTextPreTrainedModel(PreTrainedModel): """ config_class = Data2VecTextConfig - base_model_prefix = "data2vec" + base_model_prefix = "data2vec_text" supports_gradient_checkpointing = True def _init_weights(self, module): @@ -894,7 +894,7 @@ def __init__(self, config): if not config.is_decoder: logger.warning("If you want to use `Data2VecTextLMHeadModel` as a standalone, add `is_decoder=True.`") - self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecTextLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings @@ -975,7 +975,7 @@ def forward( if labels is not None: use_cache = False - outputs = self.data2vec( + outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1049,7 +1049,7 @@ def __init__(self, config): "bi-directional self-attention." ) - self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecTextLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings @@ -1097,7 +1097,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.data2vec( + outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1173,7 +1173,7 @@ def __init__(self, config): self.num_labels = config.num_labels self.config = config - self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.classifier = Data2VecTextClassificationHead(config) # Initialize weights and apply final processing @@ -1207,7 +1207,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.data2vec( + outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1269,7 +1269,7 @@ class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel): def __init__(self, config): super().__init__(config) - self.data2vec = Data2VecTextModel(config) + self.data2vec_text = Data2VecTextModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) @@ -1317,7 +1317,7 @@ def forward( else None ) - outputs = self.data2vec( + outputs = self.data2vec_text( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1366,7 +1366,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1402,7 +1402,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.data2vec( + outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1474,7 +1474,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels 
- self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) # Initialize weights and apply final processing @@ -1513,7 +1513,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.data2vec( + outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, From d0a7cf95aa7e3d5737259988ad1a6580c6813042 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 16:58:03 +0000 Subject: [PATCH 051/121] make all tests pass --- src/transformers/__init__.py | 2 + src/transformers/models/auto/__init__.py | 2 + ..._original_pytorch_checkpoint_to_pytorch.py | 19 +- .../data2vec/modeling_data2vec_audio.py | 6 +- .../models/data2vec/modeling_data2vec_text.py | 6 +- tests/test_modeling_common.py | 5 +- tests/test_modeling_data2vec.py | 351 ++---------------- 7 files changed, 45 insertions(+), 346 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 02947f3fdf286..ea6846d042584 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -669,6 +669,7 @@ _import_structure["models.auto"].extend( [ "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING", + "MODEL_FOR_AUDIO_XVECTOR_MAPPING", "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING", "MODEL_FOR_CAUSAL_LM_MAPPING", "MODEL_FOR_CTC_MAPPING", @@ -2859,6 +2860,7 @@ ) from .models.auto import ( MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, + MODEL_FOR_AUDIO_XVECTOR_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CTC_MAPPING, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 18701a42757cf..9a0e73fc24017 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -32,6 +32,7 @@ if is_torch_available(): _import_structure["modeling_auto"] = [ "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING", + "MODEL_FOR_AUDIO_XVECTOR_MAPPING", "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING", "MODEL_FOR_CAUSAL_LM_MAPPING", "MODEL_FOR_CTC_MAPPING", @@ -150,6 +151,7 @@ if is_torch_available(): from .modeling_auto import ( MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, + MODEL_FOR_AUDIO_XVECTOR_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CTC_MAPPING, diff --git a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py index 5dff263d7ab55..8659e36d9f483 100644 --- a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py @@ -32,10 +32,11 @@ BertSelfAttention, BertSelfOutput, ) -from transformers.utils import logging +# IMPORTANT: In order for this script to run, please make sure to download the dictionary: `dict.txt` from wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz # File copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py -from .data2vec_text import Data2VecTextModel +from transformers.models.data2vec.data2vec_text import Data2VecTextModel +from transformers.utils import logging if version.parse(fairseq.__version__) < version.parse("0.9.0"): @@ -80,17 +81,17 @@ def 
convert_data2vec_checkpoint_to_pytorch( # Now let's copy all the weights. # Embeddings - model.data2vec.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight - model.data2vec.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight - model.data2vec.embeddings.token_type_embeddings.weight.data = torch.zeros_like( - model.data2vec.embeddings.token_type_embeddings.weight + model.data2vec_text.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight + model.data2vec_text.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight + model.data2vec_text.embeddings.token_type_embeddings.weight.data = torch.zeros_like( + model.data2vec_text.embeddings.token_type_embeddings.weight ) # just zero them out b/c data2vec doesn't use them. - model.data2vec.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight - model.data2vec.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias + model.data2vec_text.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight + model.data2vec_text.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias for i in range(config.num_hidden_layers): # Encoder: start of layer - layer: BertLayer = model.data2vec.encoder.layer[i] + layer: BertLayer = model.data2vec_text.encoder.layer[i] data2vec_layer: TransformerSentenceEncoderLayer = data2vec_sent_encoder.layers[i] # self attention diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index b66430328aa66..90b8dce841e1d 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -72,10 +72,10 @@ DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/data2vec-audio-base", + "facebook/data2vec-audio-base-10m", + "facebook/data2vec-audio-base-100h", "facebook/data2vec-audio-base-960h", - "facebook/data2vec-audio-large-960h", - "facebook/data2vec-audio-large-960h-lv60", - "facebook/data2vec-audio-large-960h-lv60-self", # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio ] diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 2aa6db6de3fba..ecb867d637617 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -54,7 +54,7 @@ _HIDDEN_STATES_START_POSITION = 2 -_CHECKPOINT_FOR_DOC = "data2vec" +_CHECKPOINT_FOR_DOC = "facebook/data2vec-text-base" _CONFIG_FOR_DOC = "Data2VecTextConfig" _TOKENIZER_FOR_DOC = "RobertaTokenizer" @@ -67,8 +67,8 @@ _CTC_EXPECTED_LOSS = 53.48 DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "data2vec", - # See all data2vec models at https://huggingface.co/models?filter=data2vec + "facebook/data2vec-text-base", + # See all data2vec models at https://huggingface.co/models?filter=data2vec-text ] diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 21ee53a824b0d..6e2c9d1d679c5 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -69,6 +69,7 @@ from test_module.custom_modeling import CustomModel, NoSuperInitModel from transformers import ( BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MODEL_FOR_AUDIO_XVECTOR_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, 
@@ -137,6 +138,8 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): else v for k, v in inputs_dict.items() } + elif model_class in get_values(MODEL_FOR_AUDIO_XVECTOR_MAPPING): + inputs_dict.pop("attention_mask") if return_labels: if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): @@ -171,6 +174,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict["bool_masked_pos"] = torch.zeros( (self.model_tester.batch_size, num_patches**2), dtype=torch.long, device=torch_device ) + return inputs_dict def test_save_load(self): @@ -371,7 +375,6 @@ def test_initialization(self): def test_determinism(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: model = model_class(config) model.to(torch_device) diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 15ceb29ff9e0f..01dda64c5a1b1 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -52,19 +52,14 @@ Data2VecTextForSequenceClassification, Data2VecTextForTokenClassification, Data2VecTextModel, - Wav2Vec2FeatureExtractor, Wav2Vec2Processor, ) + from transformers.models.data2vec.modeling_data2vec_audio import _compute_mask_indices from transformers.models.data2vec.modeling_data2vec_text import ( DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, Data2VecTextForTextEmbeddings, create_position_ids_from_input_ids, ) - from transformers.models.wav2vec2.modeling_wav2vec2 import ( - Wav2Vec2GumbelVectorQuantizer, - _compute_mask_indices, - _sample_negative_indices, - ) class Data2VecTextModelTester: @@ -502,7 +497,7 @@ def test_create_position_ids_from_inputs_embeds(self): class Data2VecTextModelIntegrationTest(TestCasePlus): @slow def test_inference_masked_lm(self): - model = Data2VecTextForMaskedLM.from_pretrained("data2vec") + model = Data2VecTextForMaskedLM.from_pretrained("facebook/data2vec-text-base") input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) with torch.no_grad(): @@ -510,51 +505,24 @@ def test_inference_masked_lm(self): expected_shape = torch.Size((1, 11, 50265)) self.assertEqual(output.shape, expected_shape) # compare the actual values for a slice. - expected_slice = torch.tensor( - [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] - ) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() + expected_slice = torch.tensor([[[0.2328, 0.0000, 1.1710], [2.2525, 0.0000, 1.9937], [2.1280, 0.0000, 1.8691]]]) self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) @slow def test_inference_no_head(self): - model = Data2VecTextModel.from_pretrained("data2vec") + model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base") input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) with torch.no_grad(): output = model(input_ids)[0] # compare the actual values for a slice. 
expected_slice = torch.tensor( - [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] + [[[0.1998, -0.0379, 0.0024], [-0.0971, -0.2214, -0.1798], [-0.0789, -0.2400, -0.1898]]] ) - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach() - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - @slow - def test_inference_classification_head(self): - model = Data2VecTextForSequenceClassification.from_pretrained("data2vec-large-mnli") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 3)) - self.assertEqual(output.shape, expected_shape) - expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli') - # data2vec.eval() - # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() - - self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) - class Data2VecAudioModelTester: def __init__( @@ -874,7 +842,13 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( - (Data2VecAudioForCTC, Data2VecAudioModel, Data2VecAudioForSequenceClassification) + ( + Data2VecAudioForCTC, + Data2VecAudioModel, + Data2VecAudioForSequenceClassification, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForXVector, + ) if is_torch_available() else () ) @@ -1044,7 +1018,7 @@ def _mock_init_weights(self, module): def test_mask_feature_prob_ctc(self): model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 + "facebook/data2vec-audio-base-960h", mask_feature_prob=0.2, mask_feature_length=2 ) model.to(torch_device).train() processor = Wav2Vec2Processor.from_pretrained( @@ -1067,7 +1041,7 @@ def test_mask_feature_prob_ctc(self): def test_mask_time_prob_ctc(self): model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 + "facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2 ) model.to(torch_device).train() processor = Wav2Vec2Processor.from_pretrained( @@ -1094,7 +1068,7 @@ def test_feed_forward_chunking(self): @slow def test_model_from_pretrained(self): - model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") + model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base") self.assertIsNotNone(model) @@ -1190,81 +1164,6 @@ def test_compute_mask_indices_short_audio(self): # make sure that non-padded examples cannot be padded self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) - def test_compute_perplexity(self): - probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100 - - ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) - self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) - - # mask half of the input - mask = torch.ones((2,), device=torch_device, dtype=torch.bool) - mask[0] = 0 - - ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) - self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3) - - def test_sample_negatives(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - features = (torch.arange(sequence_length * hidden_size, 
device=torch_device) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() - - # sample negative indices - sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) - sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - - def test_sample_negatives_with_mask(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - # second half of last input tensor is padded - mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) - mask[-1, sequence_length // 2 :] = 0 - - features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() - - # replace masked feature vectors with -100 to test that those are not sampled - features = torch.where(mask[:, :, None].expand(features.shape).bool(), features, -100) - - # sample negative indices - sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask.cpu().numpy() - ) - sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - - self.assertTrue((negatives >= 0).all().item()) - - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - @require_torch @require_soundfile @@ -1285,7 +1184,7 @@ def _load_superb(self, task, num_samples): return ds[:num_samples] def test_inference_ctc_normal(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") + model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h") model.to(torch_device) processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) input_speech = self._load_datasamples(1) @@ -1301,31 +1200,8 @@ def test_inference_ctc_normal(self): EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) 
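
For readers following these integration tests, the end-to-end CTC flow they exercise boils down to the sketch below. The model checkpoint name is the one used in the updated tests; using the same repository for `Wav2Vec2Processor` assumes that checkpoint also ships tokenizer and feature-extractor files (the tests themselves load a processor from `hf-internal-testing/tiny-random-wav2vec2`), and the silent dummy waveform is a stand-in for real 16 kHz speech.

```python
import numpy as np
import torch
from transformers import Data2VecAudioForCTC, Wav2Vec2Processor

# Placeholder input: one second of silence at 16 kHz; replace with real speech.
audio = np.zeros(16_000, dtype=np.float32)

model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-base-960h")

inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(inputs.input_values).logits

# Greedy CTC decoding, exactly as in the tests above.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
```
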
- def test_inference_ctc_normal_batched(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") - model.to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - - with torch.no_grad(): - logits = model(input_values).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_robust_batched(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + def test_inference_ctc_batched(self): + model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h").to(torch_device) processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) input_speech = self._load_datasamples(4) @@ -1333,10 +1209,9 @@ def test_inference_ctc_robust_batched(self): inputs = processor(input_speech, return_tensors="pt", padding=True) input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) with torch.no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits + logits = model(input_values).logits predicted_ids = torch.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) @@ -1344,191 +1219,7 @@ def test_inference_ctc_robust_batched(self): EXPECTED_TRANSCRIPTIONS = [ "a man said to the universe sir i exist", "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", - "his instant panic was followed by a small sharp blow high on his chest", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about", + "his instant of panic was followed by a small sharp blow high on his chest", ] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_keyword_spotting(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") - input_data = self._load_superb("ks", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) - - expected_labels = [7, 6, 10, 9] - # s3prl logits for the same batch - expected_logits = torch.tensor([6.1186, 11.8961, 10.2931, 6.0898], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, 
atol=1e-2)) - - def test_inference_intent_classification(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("ic", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - - predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1) - predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1) - predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1) - - expected_labels_action = [0, 0, 2, 3] - expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device) - expected_labels_object = [3, 10, 3, 4] - expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device) - expected_labels_location = [0, 0, 0, 1] - expected_logits_location = torch.tensor([1.5335, 6.5096, 10.5704, 11.0569], device=torch_device) - - self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) - self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) - self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) - - self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) - self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) - self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) - - def test_inference_speaker_identification(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("si", 4) - - output_logits = [] - with torch.no_grad(): - for example in input_data["speech"]: - input = processor(example, return_tensors="pt", padding=True) - output = model(input.input_values.to(torch_device), attention_mask=None) - output_logits.append(output.logits[0]) - output_logits = torch.stack(output_logits) - predicted_logits, predicted_ids = torch.max(output_logits, dim=-1) - - expected_labels = [251, 1, 1, 3] - # s3prl logits for the same batch - expected_logits = torch.tensor([37.5627, 71.6362, 64.2419, 31.7778], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_inference_emotion_recognition(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("er", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) - - expected_labels = [1, 1, 2, 2] - # s3prl logits for the same 
batch - expected_logits = torch.tensor([2.1722, 3.0779, 8.0287, 6.6797], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_phoneme_recognition(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-ic") - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - - with torch.no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t", - "s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ oʊ n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ", - "ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː v ə n ð ə s ɔːɹ ɹ ɪ ŋ ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t", - "h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t", - ] - # should correspond to =>: - # [ - # "a man said to the universe sir i exist", - # "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - # "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", - # "his instant panic was followed by a small sharp blow high on his chest", - # ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_diarization(self): - model = Data2VecAudioForAudioFrameClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("sd", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - # labels is a one-hot array of shape (num_frames, num_speakers) - labels = (outputs.logits > 0).long() - - # s3prl logits for the same batch - expected_logits = torch.tensor( - [ - [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], - [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], - [[-0.8656, -0.4783], [-0.8899, -0.3289], [-0.9267, -0.5781], [-0.7817, -0.4619]], - [[-4.8625, -2.5316], [-5.2339, -2.2155], [-4.9835, -2.0344], [-4.4727, -1.8421]], - ], - device=torch_device, - ) - self.assertEqual(labels[0, :, 0].sum(), 555) - self.assertEqual(labels[0, :, 1].sum(), 299) - # TODO: update the tolerance after the CI moves to torch 1.10 - self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) - - def test_inference_speaker_verification(self): - model = 
Data2VecAudioForXVector.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("si", 4) - - inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) - labels = torch.tensor([5, 1, 1, 3], device=torch_device).T - - with torch.no_grad(): - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - outputs = model(input_values, attention_mask=attention_mask, labels=labels) - embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1).cpu() - - cosine_sim = torch.nn.CosineSimilarity(dim=-1) - # id10002 vs id10002 - self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) - # id10006 vs id10002 - self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).numpy(), 0.7579, 3) - # id10002 vs id10004 - self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).numpy(), 0.7594, 3) - - # TODO: update the tolerance after the CI moves to torch 1.10 - self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) From 2b958d232b3a4ddb8a5fc6c8e22fb0bd66e81648 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 17:12:01 +0000 Subject: [PATCH 052/121] finish --- docs/source/index.mdx | 3 +- docs/source/serialization.mdx | 2 +- src/transformers/__init__.py | 6 +- src/transformers/utils/dummy_pt_objects.py | 8 +- tests/test_modeling_data2vec_audio.py | 743 +++++++++++++++++++++ tests/test_modeling_data2vec_text.py | 506 ++++++++++++++ utils/check_repo.py | 12 + 7 files changed, 1275 insertions(+), 5 deletions(-) create mode 100644 tests/test_modeling_data2vec_audio.py create mode 100644 tests/test_modeling_data2vec_text.py diff --git a/docs/source/index.mdx b/docs/source/index.mdx index c4a42a4de4192..44a53a6e93774 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -181,7 +181,8 @@ Flax), PyTorch, and/or TensorFlow. 
| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNext | ❌ | ❌ | ✅ | ❌ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| Data2Vec | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/serialization.mdx b/docs/source/serialization.mdx index c77a77114e5f8..5eff7580d9b40 100644 --- a/docs/source/serialization.mdx +++ b/docs/source/serialization.mdx @@ -49,7 +49,7 @@ Ready-made configurations include the following architectures: - BART - BERT - CamemBERT -- Data2Vec +- Data2VecText - DistilBERT - ELECTRA - GPT Neo diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ea6846d042584..de985bc090c76 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -874,7 +874,8 @@ ) _import_structure["models.data2vec"].extend( [ - "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", + "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST", + "DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", "Data2VecAudioForAudioFrameClassification", "Data2VecAudioForCTC", "Data2VecAudioForSequenceClassification", @@ -3035,7 +3036,8 @@ CTRLPreTrainedModel, ) from .models.data2vec import ( - DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST, + DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, Data2VecAudioForAudioFrameClassification, Data2VecAudioForCTC, Data2VecAudioForSequenceClassification, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c542812bed23e..ba8771915af64 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -347,6 +347,9 @@ def load_tf_weights_in_albert(*args, **kwargs): MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = None +MODEL_FOR_AUDIO_XVECTOR_MAPPING = None + + MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING = None @@ -1208,7 +1211,10 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = None +DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None class Data2VecAudioForAudioFrameClassification(metaclass=DummyObject): diff --git a/tests/test_modeling_data2vec_audio.py b/tests/test_modeling_data2vec_audio.py new file mode 100644 index 0000000000000..4797d5365bc21 --- /dev/null +++ b/tests/test_modeling_data2vec_audio.py @@ -0,0 +1,743 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Data2VecAudio model. 
""" + +import math +import unittest + +import numpy as np +from datasets import load_dataset + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import Data2VecAudioConfig, is_torch_available +from transformers.testing_utils import is_pt_flax_cross_test, require_soundfile, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, _config_zero_init + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Wav2Vec2Processor, + ) + from transformers.models.data2vec.modeling_data2vec_audio import _compute_mask_indices + + +class Data2VecAudioModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, # speech is longer + is_training=False, + hidden_size=16, + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=4, + num_attention_heads=2, + hidden_dropout_prob=0.1, + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + mask_time_prob=0.5, + mask_time_length=2, + vocab_size=32, + num_adapter_layers=1, + adapter_stride=2, + tdnn_dim=(32, 32), + tdnn_kernel=(5, 3), + tdnn_dilation=(1, 2), + xvector_output_dim=32, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.num_adapter_layers = num_adapter_layers + self.adapter_stride = adapter_stride + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.scope = scope + self.tdnn_dim = tdnn_dim + self.tdnn_kernel = tdnn_kernel + self.tdnn_dilation = tdnn_dilation + self.xvector_output_dim = xvector_output_dim + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = self.get_config() + + return config, input_values, attention_mask + + def get_config(self): + return Data2VecAudioConfig( + hidden_size=self.hidden_size, + 
feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + mask_time_prob=self.mask_time_prob, + mask_time_length=self.mask_time_length, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + layer_norm_eps=self.layer_norm_eps, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + vocab_size=self.vocab_size, + num_adapter_layers=self.num_adapter_layers, + adapter_stride=self.adapter_stride, + tdnn_dim=self.tdnn_dim, + tdnn_kernel=self.tdnn_kernel, + tdnn_dilation=self.tdnn_dilation, + xvector_output_dim=self.xvector_output_dim, + ) + + def create_and_check_model(self, config, input_values, attention_mask): + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter(self, config, input_values, attention_mask): + config.add_adapter = True + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): + config.add_adapter = True + config.output_hidden_size = 8 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), + ) + + def create_and_check_batch_inference(self, config, input_values, *args): + # test does not pass for models making use of `group_norm` + # check: https://github.com/pytorch/fairseq/issues/3227 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0.0 + + batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state + + for i in range(input_values.shape[0]): + input_slice = input_values[i : i + 1, : input_lengths[i]] + output = model(input_slice).last_hidden_state + + batch_output = batch_outputs[i : i + 1, : output.shape[1]] + self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) + + def check_ctc_loss(self, config, input_values, *args): + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + 
max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
+        labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size)
+
+        # pad input
+        for i in range(len(input_lengths)):
+            input_values[i, input_lengths[i] :] = 0.0
+            attention_mask[i, input_lengths[i] :] = 0
+
+        model.config.ctc_loss_reduction = "sum"
+        sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()
+
+        model.config.ctc_loss_reduction = "mean"
+        mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()
+
+        self.parent.assertTrue(isinstance(sum_loss, float))
+        self.parent.assertTrue(isinstance(mean_loss, float))
+
+    def check_seq_classifier_loss(self, config, input_values, *args):
+        model = Data2VecAudioForSequenceClassification(config=config)
+        model.to(torch_device)
+
+        # make sure that dropout is disabled
+        model.eval()
+
+        input_values = input_values[:3]
+        attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long)
+
+        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
+        labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label))
+
+        # pad input
+        for i in range(len(input_lengths)):
+            input_values[i, input_lengths[i] :] = 0.0
+            attention_mask[i, input_lengths[i] :] = 0
+
+        masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()
+        unmasked_loss = model(input_values, labels=labels).loss.item()
+
+        self.parent.assertTrue(isinstance(masked_loss, float))
+        self.parent.assertTrue(isinstance(unmasked_loss, float))
+        self.parent.assertTrue(masked_loss != unmasked_loss)
+
+    def check_ctc_training(self, config, input_values, *args):
+        config.ctc_zero_infinity = True
+        model = Data2VecAudioForCTC(config=config)
+        model.to(torch_device)
+        model.train()
+
+        # freeze feature encoder
+        model.freeze_feature_encoder()
+
+        input_values = input_values[:3]
+
+        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
+        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
+        labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size)
+
+        # pad input
+        for i in range(len(input_lengths)):
+            input_values[i, input_lengths[i] :] = 0.0
+
+            if max_length_labels[i] < labels.shape[-1]:
+                # it's important that we make sure that target lengths are at least
+                # one shorter than logit lengths to prevent -inf
+                labels[i, max_length_labels[i] - 1 :] = -100
+
+        loss = model(input_values, labels=labels).loss
+        self.parent.assertFalse(torch.isinf(loss).item())
+
+        loss.backward()
+
+    def check_seq_classifier_training(self, config, input_values, *args):
+        config.ctc_zero_infinity = True
+        model = Data2VecAudioForSequenceClassification(config=config)
+        model.to(torch_device)
+        model.train()
+
+        # freeze everything but the classification head
+        model.freeze_base_model()
+
+        input_values = input_values[:3]
+
+        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
+        labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label))
+
+        # pad input
+        for i in range(len(input_lengths)):
+            input_values[i, input_lengths[i] :] = 0.0
+
+        loss = model(input_values, labels=labels).loss
+        self.parent.assertFalse(torch.isinf(loss).item())
+
+        loss.backward()
+
+    def check_xvector_training(self, config, input_values, *args):
+        config.ctc_zero_infinity = True
+        model = Data2VecAudioForXVector(config=config)
+        model.to(torch_device)
+        model.train()
+
+        # freeze 
everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_labels_out_of_vocab(self, config, input_values, *args): + model = Data2VecAudioForCTC(config) + model.to(torch_device) + model.train() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) + + with self.parent.assertRaises(ValueError): + model(input_values, labels=labels) + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + Data2VecAudioForCTC, + Data2VecAudioModel, + Data2VecAudioForSequenceClassification, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForXVector, + ) + if is_torch_available() + else () + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Data2VecAudioModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_adapter(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) + + def test_model_with_adapter_proj_dim(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_xvector_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_xvector_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + # Data2VecAudio has no inputs_embeds + def test_inputs_embeds(self): + pass + + # `input_ids` is renamed to 
`input_values` + def test_forward_signature(self): + pass + + # Data2VecAudio cannot resize token embeddings + # since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # Data2VecAudio has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_flax_to_pt(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_pt_to_flax(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + "project_hid.weight", + "project_hid.bias", + "project_q.weight", + "project_q.bias", + "feature_projection.projection.weight", + "feature_projection.projection.bias", + "objective.weight", + ] + if param.requires_grad: + if any([x in name for x in uniform_init_parms]): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and 
module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + def test_mask_feature_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "facebook/data2vec-audio-base-960h", mask_feature_prob=0.2, mask_feature_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + def test_mask_time_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base") + self.assertIsNotNone(model) + + +@require_torch +class Data2VecAudioUtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) + + def test_compute_mask_indices_low_prob(self): + # with these settings num_masked_spans=0.5, which means probabilistic rounding + # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in + # the other 5 out of 10, cases num_masked_spans=1 + n_trials = 100 + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + count_dimensions_masked = 0 + count_dimensions_not_masked = 0 + + for _ in range(n_trials): + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + num_masks = torch.sum(mask).item() + + if num_masks > 0: + count_dimensions_masked += 1 + else: + count_dimensions_not_masked += 1 + + # as we test for at least 10 masked dimension and at least + # 10 non-masked dimension, this test could fail with probability: + # P(100 coin flips, at most 9 heads) = 1.66e-18 + self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) + self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = 
_compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + def test_compute_mask_indices_attn_mask_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + attention_mask[:2, sequence_length // 2 :] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask + ) + mask = torch.from_numpy(mask).to(torch_device) + + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) + + def test_compute_mask_indices_short_audio(self): + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + # force one example to be heavily padded + attention_mask[0, 5:] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 + ) + + # make sure that non-padded examples cannot be padded + self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) + + +@require_torch +@require_soundfile +@slow +class Data2VecAudioModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def _load_superb(self, task, num_samples): + ds = load_dataset("anton-l/superb_dummy", task, split="test") + + return ds[:num_samples] + + def test_inference_ctc_normal(self): + model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + input_speech = self._load_datasamples(1) + + input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_batched(self): + model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i 
exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about", + "his instant of panic was followed by a small sharp blow high on his chest", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) diff --git a/tests/test_modeling_data2vec_text.py b/tests/test_modeling_data2vec_text.py new file mode 100644 index 0000000000000..45dba500601b4 --- /dev/null +++ b/tests/test_modeling_data2vec_text.py @@ -0,0 +1,506 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Data2VecAudio model. """ + +import unittest + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import Data2VecTextConfig, is_torch_available +from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextModel, + ) + from transformers.models.data2vec.modeling_data2vec_text import ( + DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecTextForTextEmbeddings, + create_position_ids_from_input_ids, + ) + + +class Data2VecTextModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = 
ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return Data2VecTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = Data2VecTextModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = Data2VecTextForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + 
token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        encoder_hidden_states,
+        encoder_attention_mask,
+    ):
+        config.is_decoder = True
+        config.add_cross_attention = True
+        model = Data2VecTextForCausalLM(config=config).to(torch_device).eval()
+
+        # make sure that ids don't start with pad token
+        mask = input_ids.ne(config.pad_token_id).long()
+        input_ids = input_ids * mask
+
+        # first forward pass
+        outputs = model(
+            input_ids,
+            attention_mask=input_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=True,
+        )
+        past_key_values = outputs.past_key_values
+
+        # create hypothetical multiple next tokens and extend to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+
+        # make sure that ids don't start with pad token
+        mask = next_tokens.ne(config.pad_token_id).long()
+        next_tokens = next_tokens * mask
+        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
+
+        # append to next input_ids and attention mask
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
+
+        output_from_no_past = model(
+            next_input_ids,
+            attention_mask=next_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+        output_from_past = model(
+            next_tokens,
+            attention_mask=next_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def create_and_check_for_masked_lm(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = Data2VecTextForMaskedLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+    def create_and_check_for_token_classification(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = Data2VecTextForTokenClassification(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
+
+    def create_and_check_for_multiple_choice(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_choices = self.num_choices
+        model = Data2VecTextForMultipleChoice(config=config)
+        model.to(torch_device)
+        model.eval()
+        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_token_type_ids = 
token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class Data2VecTextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextModel, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = Data2VecTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def 
test_decoder_model_past_with_large_inputs(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
+        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
+
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
+
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
+
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
+
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = Data2VecTextModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+
+    def test_create_position_ids_respects_padding_index(self):
+        """Ensure that the default position ids only assign a sequential order to non-padding tokens. This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+
+        The position ids should be masked with the embedding object's padding index. Therefore, the
+        first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1
+        """
+        config = self.model_tester.prepare_config_and_inputs()[0]
+        model = Data2VecTextForTextEmbeddings(config=config)
+
+        input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
+        expected_positions = torch.as_tensor(
+            [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]]
+        )
+
+        position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx)
+        self.assertEqual(position_ids.shape, expected_positions.shape)
+        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
+
+    def test_create_position_ids_from_inputs_embeds(self):
+        """Ensure that the default position ids only assign a sequential order to non-padding tokens. This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+
+        The position ids should be masked with the embedding object's padding index. 
Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = Data2VecTextForTextEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + +@require_torch +class Data2VecTextModelIntegrationTest(TestCasePlus): + @slow + def test_inference_masked_lm(self): + model = Data2VecTextForMaskedLM.from_pretrained("facebook/data2vec-text-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor([[[0.2328, 0.0000, 1.1710], [2.2525, 0.0000, 1.9937], [2.1280, 0.0000, 1.8691]]]) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_no_head(self): + model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[0.1998, -0.0379, 0.0024], [-0.0971, -0.2214, -0.1798], [-0.0789, -0.2400, -0.1898]]] + ) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/utils/check_repo.py b/utils/check_repo.py index 41ba2fab6763f..765631d559e1d 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -18,6 +18,7 @@ import os import re import warnings +from collections import OrderedDict from difflib import get_close_matches from pathlib import Path @@ -171,6 +172,16 @@ "Data2VecAudioModel", # TODO: Fix ] +# Update this list for models that have multiple model types for the same +# model doc +MODEL_TYPE_TO_DOC_MAPPING = OrderedDict( + [ + ("data2vec-text", "data2vec"), + ("data2vec-audio", "data2vec"), + ] +) + + # This is to make sure the transformers module imported is the one in the repo. 
spec = importlib.util.spec_from_file_location( "transformers", @@ -629,6 +640,7 @@ def check_model_type_doc_match(): model_docs = [m.stem for m in model_doc_folder.glob("*.mdx")] model_types = list(transformers.models.auto.configuration_auto.MODEL_NAMES_MAPPING.keys()) + model_types = [MODEL_TYPE_TO_DOC_MAPPING[m] if m in MODEL_TYPE_TO_DOC_MAPPING else m for m in model_types] errors = [] for m in model_docs: From 89d8f9b6af91fb7e1d409e8dbadbc4c9830ccfc1 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 17:24:58 +0000 Subject: [PATCH 053/121] remove useless test file --- tests/test_modeling_data2vec.py | 1225 ------------------------------- 1 file changed, 1225 deletions(-) delete mode 100644 tests/test_modeling_data2vec.py diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py deleted file mode 100644 index 01dda64c5a1b1..0000000000000 --- a/tests/test_modeling_data2vec.py +++ /dev/null @@ -1,1225 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Data2VecAudio model. """ - -import math -import unittest - -import numpy as np -from datasets import load_dataset - -from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask -from transformers import Data2VecAudioConfig, Data2VecTextConfig, is_torch_available -from transformers.testing_utils import ( - TestCasePlus, - is_pt_flax_cross_test, - require_soundfile, - require_torch, - slow, - torch_device, -) - -from .test_configuration_common import ConfigTester -from .test_generation_utils import GenerationTesterMixin -from .test_modeling_common import ModelTesterMixin, _config_zero_init - - -if is_torch_available(): - import torch - - from transformers import ( - Data2VecAudioForAudioFrameClassification, - Data2VecAudioForCTC, - Data2VecAudioForSequenceClassification, - Data2VecAudioForXVector, - Data2VecAudioModel, - Data2VecTextForCausalLM, - Data2VecTextForMaskedLM, - Data2VecTextForMultipleChoice, - Data2VecTextForQuestionAnswering, - Data2VecTextForSequenceClassification, - Data2VecTextForTokenClassification, - Data2VecTextModel, - Wav2Vec2Processor, - ) - from transformers.models.data2vec.modeling_data2vec_audio import _compute_mask_indices - from transformers.models.data2vec.modeling_data2vec_text import ( - DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecTextForTextEmbeddings, - create_position_ids_from_input_ids, - ) - - -class Data2VecTextModelTester: - def __init__( - self, - parent, - ): - self.parent = parent - self.batch_size = 13 - self.seq_length = 7 - self.is_training = True - self.use_input_mask = True - self.use_token_type_ids = True - self.use_labels = True - self.vocab_size = 99 - self.hidden_size = 32 - self.num_hidden_layers = 5 - self.num_attention_heads = 4 - self.intermediate_size = 37 - self.hidden_act = "gelu" - self.hidden_dropout_prob = 0.1 - self.attention_probs_dropout_prob = 0.1 - self.max_position_embeddings = 512 - 
self.type_vocab_size = 16 - self.type_sequence_label_size = 2 - self.initializer_range = 0.02 - self.num_labels = 3 - self.num_choices = 4 - self.scope = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return Data2VecTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Data2VecTextModel(config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 
self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Data2VecTextForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Data2VecTextForCausalLM(config=config).to(torch_device).eval() - - # make sure that ids don't start with pad token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = 
Data2VecTextForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = Data2VecTextForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class Data2VecTextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Data2VecTextForCausalLM, - Data2VecTextForMaskedLM, - Data2VecTextModel, - Data2VecTextForSequenceClassification, - Data2VecTextForTokenClassification, - Data2VecTextForMultipleChoice, - Data2VecTextForQuestionAnswering, - ) - if is_torch_available() - else () - ) - all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else () - - def setUp(self): - self.model_tester = Data2VecTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # 
This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = Data2VecTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_create_position_ids_respects_padding_index(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = Data2VecTextForTextEmbeddings(config=config) - - input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = torch.as_tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. 
Therefore, the - first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = Data2VecTextForTextEmbeddings(config=config) - - inputs_embeds = torch.empty(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - -@require_torch -class Data2VecTextModelIntegrationTest(TestCasePlus): - @slow - def test_inference_masked_lm(self): - model = Data2VecTextForMaskedLM.from_pretrained("facebook/data2vec-text-base") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 11, 50265)) - self.assertEqual(output.shape, expected_shape) - # compare the actual values for a slice. - expected_slice = torch.tensor([[[0.2328, 0.0000, 1.1710], [2.2525, 0.0000, 1.9937], [2.1280, 0.0000, 1.8691]]]) - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_no_head(self): - model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - # compare the actual values for a slice. - expected_slice = torch.tensor( - [[[0.1998, -0.0379, 0.0024], [-0.0971, -0.2214, -0.1798], [-0.0789, -0.2400, -0.1898]]] - ) - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - -class Data2VecAudioModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=4, - num_attention_heads=2, - hidden_dropout_prob=0.1, - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - mask_time_prob=0.5, - mask_time_length=2, - vocab_size=32, - num_adapter_layers=1, - adapter_stride=2, - tdnn_dim=(32, 32), - tdnn_kernel=(5, 3), - tdnn_dilation=(1, 2), - xvector_output_dim=32, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - 
self.vocab_size = vocab_size - self.num_adapter_layers = num_adapter_layers - self.adapter_stride = adapter_stride - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.scope = scope - self.tdnn_dim = tdnn_dim - self.tdnn_kernel = tdnn_kernel - self.tdnn_dilation = tdnn_dilation - self.xvector_output_dim = xvector_output_dim - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): - return Data2VecAudioConfig( - hidden_size=self.hidden_size, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - mask_time_prob=self.mask_time_prob, - mask_time_length=self.mask_time_length, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - num_adapter_layers=self.num_adapter_layers, - adapter_stride=self.adapter_stride, - tdnn_dim=self.tdnn_dim, - tdnn_kernel=self.tdnn_kernel, - tdnn_dilation=self.tdnn_dilation, - xvector_output_dim=self.xvector_output_dim, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter(self, config, input_values, attention_mask): - config.add_adapter = True - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): - config.add_adapter = True - config.output_hidden_size = 8 - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = Data2VecAudioModel(config=config) - model.to(torch_device) - 
model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = Data2VecAudioForCTC(config=config) - model.to(torch_device) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = Data2VecAudioForSequenceClassification(config=config) - model.to(torch_device) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForCTC(config=config) - model.to(torch_device) - model.train() - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lenghts are at least - # one 
shorter than logit lenghts to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForSequenceClassification(config=config) - model.to(torch_device) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_xvector_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForXVector(config=config) - model.to(torch_device) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = Data2VecAudioForCTC(config) - model.to(torch_device) - model.train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) - - with self.parent.assertRaises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_torch -class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Data2VecAudioForCTC, - Data2VecAudioModel, - Data2VecAudioForSequenceClassification, - Data2VecAudioForAudioFrameClassification, - Data2VecAudioForXVector, - ) - if is_torch_available() - else () - ) - test_pruning = False - test_headmasking = False - test_torchscript = False - - def setUp(self): - self.model_tester = Data2VecAudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - def 
test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # Data2VecAudio has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to `input_values` - def test_forward_signature(self): - pass - - # Data2VecAudio cannot resize token embeddings - # since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # Data2VecAudio has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented - def test_model_common_attributes(self): - pass - - @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_flax_to_pt(self): - pass - - @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_pt_to_flax(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - model.to(torch_device) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - - input_lengths = torch.tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - 
"feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any([x in name for x in uniform_init_parms]): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def test_mask_feature_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "facebook/data2vec-audio-base-960h", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - def test_mask_time_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base") - self.assertIsNotNone(model) - - -@require_torch -class Data2VecAudioUtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) - - def 
test_compute_mask_indices_low_prob(self): - # with these settings num_masked_spans=0.5, which means probabilistic rounding - # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in - # the other 5 out of 10, cases num_masked_spans=1 - n_trials = 100 - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - count_dimensions_masked = 0 - count_dimensions_not_masked = 0 - - for _ in range(n_trials): - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - num_masks = torch.sum(mask).item() - - if num_masks > 0: - count_dimensions_masked += 1 - else: - count_dimensions_not_masked += 1 - - # as we test for at least 10 masked dimension and at least - # 10 non-masked dimension, this test could fail with probability: - # P(100 coin flips, at most 9 heads) = 1.66e-18 - self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) - self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - def test_compute_mask_indices_attn_mask_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) - attention_mask[:2, sequence_length // 2 :] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask - ) - mask = torch.from_numpy(mask).to(torch_device) - - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) - - def test_compute_mask_indices_short_audio(self): - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) - # force one example to be heavily padded - attention_mask[0, 5:] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 - ) - - # make sure that non-padded examples cannot be padded - self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) - - -@require_torch -@require_soundfile -@slow -class Data2VecAudioModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test") - - return ds[:num_samples] - - def test_inference_ctc_normal(self): - model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h") - model.to(torch_device) - processor = 
Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - input_speech = self._load_datasamples(1) - - input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) - - with torch.no_grad(): - logits = model(input_values).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_batched(self): - model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h").to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - - with torch.no_grad(): - logits = model(input_values).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about", - "his instant of panic was followed by a small sharp blow high on his chest", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) From 86cc898ba2bbc15985a06cab7c7d0edb37600826 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 24 Feb 2022 00:01:46 +0100 Subject: [PATCH 054/121] Update tests/test_modeling_common.py --- tests/test_modeling_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 6e2c9d1d679c5..889cc95c37429 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1914,7 +1914,6 @@ def test_load_with_mismatched_shapes(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - print(model_class) if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): continue From a2de5957cbdeb0994dcfba2e43a5b12ce7c35cec Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 23 Feb 2022 15:24:10 -0800 Subject: [PATCH 055/121] Update utils/check_repo.py Co-authored-by: Patrick von Platen --- utils/check_repo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/check_repo.py b/utils/check_repo.py index 765631d559e1d..d1d979334f894 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -169,7 +169,6 @@ "VisualBertForMultipleChoice", "TFWav2Vec2ForCTC", "TFHubertForCTC", - "Data2VecAudioModel", # TODO: Fix ] # Update this list for models that have multiple model types for the same From dafc36dbd39b22d387c966483734daa08cda818c Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 23 Feb 2022 15:24:54 -0800 Subject: [PATCH 056/121] Update src/transformers/models/data2vec/modeling_data2vec_text.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/modeling_data2vec_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py 
b/src/transformers/models/data2vec/modeling_data2vec_text.py index ecb867d637617..6e8cd7d82078d 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -961,7 +961,7 @@ def forward( >>> from transformers import Data2VecTextTokenizer, Data2VecTextForCausalLM, Data2VecTextConfig >>> import torch - >>> tokenizer = Data2VecTextTokenizer.from_pretrained("data2vec-base") + >>> tokenizer = Data2VecTextTokenizer.from_pretrained("facebook/data2vec-text-base") >>> config = Data2VecTextConfig.from_pretrained("data2vec-base") >>> config.is_decoder = True >>> model = Data2VecTextForCausalLM.from_pretrained("data2vec-base", config=config) From 79994cff6a44a2f94b0a56709c7dbb736b47e6f3 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Thu, 3 Feb 2022 16:12:35 -0500 Subject: [PATCH 057/121] Fix copies --- src/transformers/models/data2vec/modeling_data2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 85cb88f510a66..172958898535b 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -62,7 +62,6 @@ ] - # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2Vec class Data2VecEmbeddings(nn.Module): """ From b368a6e2447d21194fea3adb9975fb4a13134c1f Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Thu, 3 Feb 2022 16:21:24 -0500 Subject: [PATCH 058/121] Update docs --- src/transformers/models/data2vec/modeling_data2vec.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 172958898535b..dbdfcd447ca55 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -952,13 +952,13 @@ def forward( Example: ```python - >>> from transformers import Data2VecTokenizer, Data2VecForCausalLM, Data2VecConfig + >>> from transformers import RobertaTokenizer, Data2VecForSequenceClassification, Data2VecConfig >>> import torch - >>> tokenizer = Data2VecTokenizer.from_pretrained("data2vec-base") - >>> config = Data2VecConfig.from_pretrained("data2vec-base") - >>> config.is_decoder = True - >>> model = Data2VecForCausalLM.from_pretrained("data2vec-base", config=config) + >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-large") + >>> config = Data2VecConfig.from_pretrained("edugp/data2vec-nlp-base") + >>> model = Data2VecForSequenceClassification.from_pretrained("edugp/data2vec-nlp-base", config=config) + >>> # Fine-tune this model >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) From e2dbdb26ac68a03ebb34b677e1a7f778543b31cc Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:30:04 -0500 Subject: [PATCH 059/121] Remove fairseq data2vec_text script and fix format --- ..._original_pytorch_checkpoint_to_pytorch.py | 5 +- .../models/data2vec/data2vec_text.py | 519 ------------------ 2 files changed, 4 insertions(+), 520 deletions(-) delete mode 100644 src/transformers/models/data2vec/data2vec_text.py diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py 
index 8b8220d35e43f..bc4e41e4e058a 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -33,6 +33,7 @@ BertSelfOutput, ) from transformers.utils import logging + from .data2vec_text import Data2VecTextModel @@ -53,7 +54,9 @@ def convert_data2vec_checkpoint_to_pytorch( Copy/paste/tweak data2vec's weights to our BERT structure. """ data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path) - data2vec = Data2VecTextModel.from_pretrained(data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name) + data2vec = Data2VecTextModel.from_pretrained( + data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name + ) data2vec.eval() # disable dropout data2vec_model = data2vec.models[0] data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder diff --git a/src/transformers/models/data2vec/data2vec_text.py b/src/transformers/models/data2vec/data2vec_text.py deleted file mode 100644 index 5cc1d37e842aa..0000000000000 --- a/src/transformers/models/data2vec/data2vec_text.py +++ /dev/null @@ -1,519 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from dataclasses import dataclass, field -from typing import Optional -import logging -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from omegaconf import II - -from fairseq.dataclass import FairseqDataclass -from fairseq.dataclass.configs import EMAConfig -from fairseq.models.ema import EMA -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderModel, - register_model, -) -from fairseq.models.roberta.model import RobertaLMHead, RobertaClassificationHead -from fairseq.models.transformer import TransformerEncoder, TransformerConfig -from fairseq.modules.transformer_sentence_encoder import init_bert_params - -logger = logging.getLogger(__name__) - - -@dataclass -class Data2VecTextConfig(FairseqDataclass): - max_positions: int = II("task.tokens_per_sample") - - head_layers: int = 1 - - transformer: TransformerConfig = TransformerConfig() - - load_checkpoint_heads: bool = field( - default=False, - metadata={"help": "(re-)register and load heads when loading checkpoints"}, - ) - - loss_beta: float = field( - default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"} - ) - loss_scale: Optional[float] = field( - default=None, - metadata={ - "help": "scale the reconstruction loss by this constant. 
if None then scales by 1/sqrt(dim)" - }, - ) - average_top_k_layers: int = field( - default=8, metadata={"help": "how many layers to average"} - ) - - layer_norm_target_layer: bool = False - instance_norm_target_layer: bool = False - batch_norm_target_layer: bool = False - instance_norm_targets: bool = False - layer_norm_targets: bool = False - - ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) - ema_end_decay: float = field( - default=0.9999, metadata={"help": "final ema decay rate"} - ) - - # when to finish annealing ema decay rate - ema_anneal_end_step: int = II("optimization.max_update") - - ema_transformer_layers_only: bool = field( - default=True, - metadata={"help": "whether to momentum update only the transformer layers"}, - ) - - -def get_annealed_rate(start, end, curr_step, total_steps): - r = end - start - pct_remaining = 1 - curr_step / total_steps - return end - r * pct_remaining - - -@register_model("data2vec_text", dataclass=Data2VecTextConfig) -class Data2VecTextModel(FairseqEncoderModel): - def __init__(self, cfg: Data2VecTextConfig, encoder): - super().__init__(encoder) - self.cfg = cfg - - # We follow BERT's random weight initialization - self.apply(init_bert_params) - - self.classification_heads = nn.ModuleDict() - - @classmethod - def build_model(cls, cfg, task): - """Build a new model instance.""" - - encoder = Data2VecTextEncoder(cfg, task.source_dictionary, task.cfg.data) - - return cls(cfg, encoder) - - def forward( - self, - src_tokens, - target_tokens=None, - features_only=False, - return_all_hiddens=False, - classification_head_name=None, - **kwargs, - ): - if classification_head_name is not None: - features_only = True - - res = self.encoder( - src_tokens, target_tokens, features_only, return_all_hiddens, **kwargs - ) - - if isinstance(res, tuple): - x, extra = res - else: - return res - - if classification_head_name is not None: - x = self.classification_heads[classification_head_name](x) - return x, extra - - def get_normalized_probs(self, net_output, log_probs, sample=None): - """Get normalized probabilities (or log probs) from a net's output.""" - logits = net_output[0].float() - if log_probs: - return F.log_softmax(logits, dim=-1) - else: - return F.softmax(logits, dim=-1) - - def register_classification_head( - self, name, num_classes=None, inner_dim=None, **kwargs - ): - """Register a classification head.""" - if name in self.classification_heads: - prev_num_classes = self.classification_heads[name].out_proj.out_features - prev_inner_dim = self.classification_heads[name].dense.out_features - if num_classes != prev_num_classes or inner_dim != prev_inner_dim: - logger.warning( - 're-registering head "{}" with num_classes {} (prev: {}) ' - "and inner_dim {} (prev: {})".format( - name, num_classes, prev_num_classes, inner_dim, prev_inner_dim - ) - ) - self.classification_heads[name] = RobertaClassificationHead( - input_dim=self.cfg.transformer.encoder.embed_dim, - inner_dim=inner_dim or self.cfg.transformer.encoder.embed_dim, - num_classes=num_classes, - activation_fn="tanh", - pooler_dropout=0, - ) - - @property - def supported_targets(self): - return {"self"} - - def upgrade_state_dict_named(self, state_dict, name): - prefix = name + "." 
if name != "" else "" - - # rename decoder -> encoder before upgrading children modules - for k in list(state_dict.keys()): - if k.startswith(prefix + "decoder"): - new_k = prefix + "encoder" + k[len(prefix + "decoder") :] - state_dict[new_k] = state_dict[k] - del state_dict[k] - - # rename emb_layer_norm -> layernorm_embedding - for k in list(state_dict.keys()): - if ".emb_layer_norm." in k: - new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.") - state_dict[new_k] = state_dict[k] - del state_dict[k] - - if self.encoder.regression_head is not None: - if ".lm_head." in k: - new_k = k.replace(".lm_head.", ".regression_head.") - state_dict[new_k] = state_dict[k] - del state_dict[k] - else: - if ".regression_head." in k: - del state_dict[k] - - # upgrade children modules - super().upgrade_state_dict_named(state_dict, name) - - # Handle new classification heads present in the state dict. - current_head_names = ( - [] - if not hasattr(self, "classification_heads") - or self.classification_heads is None - else self.classification_heads.keys() - ) - keys_to_delete = [] - for k in state_dict.keys(): - if not k.startswith(prefix + "classification_heads."): - continue - - head_name = k[len(prefix + "classification_heads.") :].split(".")[0] - num_classes = state_dict[ - prefix + "classification_heads." + head_name + ".out_proj.weight" - ].size(0) - inner_dim = state_dict[ - prefix + "classification_heads." + head_name + ".dense.weight" - ].size(0) - - if self.cfg.load_checkpoint_heads: - if head_name not in current_head_names: - self.register_classification_head(head_name, num_classes, inner_dim) - else: - if head_name not in current_head_names: - logger.warning( - "deleting classification head ({}) from checkpoint " - "not present in current model: {}".format(head_name, k) - ) - keys_to_delete.append(k) - elif ( - num_classes - != self.classification_heads[head_name].out_proj.out_features - or inner_dim - != self.classification_heads[head_name].dense.out_features - ): - logger.warning( - "deleting classification head ({}) from checkpoint " - "with different dimensions than current model: {}".format( - head_name, k - ) - ) - keys_to_delete.append(k) - for k in keys_to_delete: - del state_dict[k] - - # Copy any newly-added classification heads into the state dict - # with their current weights. - if ( - hasattr(self, "classification_heads") - and self.classification_heads is not None - and len(self.classification_heads) > 0 - ): - cur_state = self.classification_heads.state_dict() - for k, v in cur_state.items(): - if prefix + "classification_heads." + k not in state_dict: - logger.info("Overwriting " + prefix + "classification_heads." + k) - state_dict[prefix + "classification_heads." + k] = v - - for k in list(state_dict.keys()): - if k.startswith(prefix + "encoder.lm_head.") or k.startswith( - prefix + "encoder.emb_head." 
- ): - del state_dict[k] - - self.encoder.lm_head = None - - if self.encoder.target_model is None: - for k in list(state_dict.keys()): - if k.startswith(prefix + "encoder.target_model."): - del state_dict[k] - - if (self.encoder.ema is None) and (prefix + "encoder._ema" in state_dict): - del state_dict[prefix + "encoder._ema"] - - def remove_pretraining_modules(self, last_layer=None): - self.encoder.lm_head = None - self.encoder.regression_head = None - self.encoder.ema = None - self.classification_heads = None - - if last_layer is not None: - self.encoder.sentence_encoder.layers = nn.ModuleList( - l - for i, l in enumerate(self.encoder.sentence_encoder.layers) - if i <= last_layer - ) - self.encoder.sentence_encoder.layer_norm = None - - -class Data2VecTextEncoder(FairseqEncoder): - def __init__(self, cfg: Data2VecTextConfig, dictionary, task_data): - super().__init__(dictionary) - - # HACK - self.target_model = None - - self.cfg = cfg - - embed_tokens = self.build_embedding( - len(dictionary), cfg.transformer.encoder.embed_dim, dictionary.pad() - ) - - self.sentence_encoder = self.build_encoder(cfg, dictionary, embed_tokens) - self.mask_idx = dictionary.index("") - assert self.mask_idx != dictionary.unk(), dictionary.symbols - - self.ema = None - self.average_top_k_layers = cfg.average_top_k_layers - self.loss_scale = cfg.loss_scale - - assert self.cfg.head_layers >= 1 - - embed_dim = cfg.transformer.encoder.embed_dim - curr_dim = embed_dim - projs = [] - for i in range(self.cfg.head_layers - 1): - next_dim = embed_dim * 2 if i == 0 else curr_dim - projs.append(nn.Linear(curr_dim, next_dim)) - projs.append(nn.GELU()) - curr_dim = next_dim - - projs.append(nn.Linear(curr_dim, embed_dim)) - self.regression_head = nn.Sequential(*projs) - - self.num_updates = 0 - - def build_embedding(self, vocab_size, embedding_dim, padding_idx): - return nn.Embedding(vocab_size, embedding_dim, padding_idx) - - def build_encoder(self, cfg, dictionary, embed_tokens): - encoder = TransformerEncoder(cfg.transformer, dictionary, embed_tokens, return_fc=True) - encoder.apply(init_bert_params) - return encoder - - def build_lm_head(self, embed_dim, output_dim, activation_fn, weight): - return RobertaLMHead(embed_dim, output_dim, activation_fn, weight) - - def make_ema_teacher(self): - ema_config = EMAConfig( - ema_decay=self.cfg.ema_decay, - ema_fp32=True, - ) - skip_keys = set() - if self.cfg.ema_transformer_layers_only: - for k, _ in self.sentence_encoder.embed_positions.named_parameters(): - skip_keys.add(f"embed_tokens.{k}") - for k, _ in self.sentence_encoder.embed_positions.named_parameters(): - skip_keys.add(f"embed_positions.{k}") - if self.sentence_encoder.layernorm_embedding is not None: - for ( - k, - _, - ) in self.sentence_encoder.layernorm_embedding.named_parameters(): - skip_keys.add(f"layernorm_embedding.{k}") - if self.sentence_encoder.layer_norm is not None: - for k, _ in self.sentence_encoder.layer_norm.named_parameters(): - skip_keys.add(f"layernorm_embedding.{k}") - - self.ema = EMA( - self.sentence_encoder, - ema_config, - skip_keys=skip_keys, - ) - - def set_num_updates(self, num_updates): - super().set_num_updates(num_updates) - - if self.ema is None and self.regression_head is not None: - logger.info(f"making ema teacher") - self.make_ema_teacher() - elif self.training and self.ema is not None: - if self.cfg.ema_decay != self.cfg.ema_end_decay: - if num_updates >= self.cfg.ema_anneal_end_step: - decay = self.cfg.ema_end_decay - else: - decay = get_annealed_rate( - self.cfg.ema_decay, - 
self.cfg.ema_end_decay, - num_updates, - self.cfg.ema_anneal_end_step, - ) - self.ema._set_decay(decay) - if self.ema.get_decay() < 1: - self.ema.step(self.sentence_encoder) - - def state_dict(self, destination=None, prefix="", keep_vars=False): - state = super().state_dict(destination, prefix, keep_vars) - if self.ema is not None: - state[prefix + "_ema"] = self.ema.fp32_params - return state - - def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): - if self.ema is not None: - k = prefix + "_ema" - assert k in state_dict - self.ema.restore(state_dict[k], True) - del state_dict[k] - return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) - - def forward( - self, - src_tokens, - target_tokens=None, - features_only=False, - return_all_hiddens=False, - masked_tokens=None, - **unused, - ): - """ - Args: - src_tokens (LongTensor): input tokens of shape `(batch, src_len)` - features_only (bool, optional): skip LM head and just return - features. If True, the output will be of shape `(batch, src_len, embed_dim)`. - return_all_hiddens (bool, optional): also return all of the - intermediate hidden states (default: False). - - Returns: - tuple: - - the LM output of shape `(batch, src_len, vocab)` - - a dictionary of additional data, where 'inner_states' is a list of hidden states. Note that the - hidden states have shape `(src_len, batch, vocab)`. - """ - - x, extra = self.extract_features( - src_tokens, return_all_hiddens=return_all_hiddens - ) - - if features_only: - return x, extra - - assert target_tokens is not None - - with torch.no_grad(): - # use EMA parameter as the teacher - self.ema.model.eval() - - encoder_out = self.ema.model( - target_tokens, - return_all_hiddens=True, - ) - y = encoder_out["fc_results"] - - y = y[-self.average_top_k_layers :] - - permuted = False - if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: - y = [tl.permute(1, 2, 0) for tl in y] # TBC -> BCT - permuted = True - - if self.cfg.batch_norm_target_layer: - y = [ - F.batch_norm( - tl.float(), running_mean=None, running_var=None, training=True - ) - for tl in y - ] - - if self.cfg.instance_norm_target_layer: - y = [F.instance_norm(tl.float()) for tl in y] - - if permuted: - y = [tl.transpose(1, 2) for tl in y] # BCT -> BTC - - if self.cfg.layer_norm_target_layer: - y = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in y] - - y = sum(y) / len(y) - - if not permuted: - y = y.transpose(0, 1) - - if self.cfg.layer_norm_targets: - y = F.layer_norm(y.float(), y.shape[-1:]) - - if self.cfg.instance_norm_targets: - y = F.instance_norm(y.transpose(1, 2)).transpose(1, 2) - - masked_indices = src_tokens.eq(self.mask_idx) - - x = x[masked_indices] - y = y[masked_indices] - - x = self.regression_head(x) - - sz = x.size(-1) - if self.cfg.loss_beta == 0: - loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) - else: - loss = F.smooth_l1_loss( - x.float(), y.float(), reduction="none", beta=self.cfg.loss_beta - ).sum(dim=-1) - - result = { - "losses": { - "main": loss.sum() / math.sqrt(sz) - if self.loss_scale <= 0 - else loss.sum() * self.loss_scale, - }, - "sample_size": loss.numel(), - } - - # logging other values - other_logs = { - "ema_decay": self.ema.get_decay() * 1000 - } - result["logs"] = other_logs - return result - - def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs): - encoder_out = self.sentence_encoder( - src_tokens, - return_all_hiddens=return_all_hiddens, - token_embeddings=kwargs.get("token_embeddings", None), - ) 
- # T x B x C -> B x T x C - features = encoder_out["encoder_out"][0].transpose(0, 1) - inner_states = encoder_out["encoder_states"] if return_all_hiddens else None - return features, { - "inner_states": inner_states, - "encoder_embedding": encoder_out["encoder_embedding"][0], - } - - def output_layer(self, features, masked_tokens=None, **unused): - return self.lm_head(features, masked_tokens) - - def max_positions(self): - """Maximum output length supported by the encoder.""" - return self.cfg.max_positions From e6704613aab78c8ea813c6b5eb4d17911df971c4 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:32:52 -0500 Subject: [PATCH 060/121] Add comment on where to get data2vec_text.py --- ...convert_data2vec_original_pytorch_checkpoint_to_pytorch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index bc4e41e4e058a..9861e4a6f451e 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -34,7 +34,9 @@ ) from transformers.utils import logging -from .data2vec_text import Data2VecTextModel +from .data2vec_text import ( + Data2VecTextModel, # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py +) if version.parse(fairseq.__version__) < version.parse("0.9.0"): From 3d21d60b19e68f82907a3cc3206a25005d835f97 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:37:16 -0500 Subject: [PATCH 061/121] Remove mock implementation cheat.py and fix style --- src/transformers/models/data2vec/cheat.py | 40 ------------------- .../models/data2vec/configuration_data2vec.py | 1 - 2 files changed, 41 deletions(-) delete mode 100644 src/transformers/models/data2vec/cheat.py diff --git a/src/transformers/models/data2vec/cheat.py b/src/transformers/models/data2vec/cheat.py deleted file mode 100644 index b043bef5a3547..0000000000000 --- a/src/transformers/models/data2vec/cheat.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -import tarfile -import urllib.request - -# Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py -from .data2vec_text import Data2VecTextModel - - -class Data2VecFairseqProxy(): - def __init__(self, module): - self.module = module - - @classmethod - def from_pretrained(cls, mname): - ckpt = f"{mname}.pt" - cls._download_weights(model=ckpt) - return cls(Data2VecTextModel.from_pretrained("roberta/roberta.large", checkpoint_file=ckpt)) - - @staticmethod - def _download_weights(model: str="nlp_base.pt"): - assert model in ("nlp_base.pt", "audio_base_ls.pt"), "Weights not found" - root_url = "https://dl.fbaipublicfiles.com/fairseq" - - if model == "nlp_base.pt": - # Need download RoBERTa first to get the dictionary file - if not os.path.isdir("roberta"): - print("Downloading roberta") - urllib.request.urlretrieve(f"{root_url}/models/roberta.large.tar.gz", "roberta.large.tar.gz") - with tarfile.open("roberta.large.tar.gz") as f: - f.extractall("roberta") - # Remove Roberta model weights and tar file - os.remove(os.path.join("roberta", "roberta.large", "model.pt")) - os.remove(os.path.join("roberta.large.tar.gz")) - - # Then download the actual data2vec weights - model_url = 
f"{root_url}/data2vec/{model}" - model_path = os.path.join("roberta", "roberta.large", model) - if not os.path.isfile(model_path): - print("Downloading model...") - urllib.request.urlretrieve(model_url, model_path) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index 43a80373ae748..f4929e2854d16 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -29,7 +29,6 @@ } - class Data2VecConfig(BertConfig): r""" This is the configuration class to store the configuration of a [`Data2VecModel`] or a [`TFData2VecModel`]. It is From 16c8361c82784e171e2c825ea175696d01658f1c Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:41:26 -0500 Subject: [PATCH 062/121] Fix copies --- src/transformers/models/data2vec/modeling_data2vec.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index dbdfcd447ca55..172958898535b 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -952,13 +952,13 @@ def forward( Example: ```python - >>> from transformers import RobertaTokenizer, Data2VecForSequenceClassification, Data2VecConfig + >>> from transformers import Data2VecTokenizer, Data2VecForCausalLM, Data2VecConfig >>> import torch - >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-large") - >>> config = Data2VecConfig.from_pretrained("edugp/data2vec-nlp-base") - >>> model = Data2VecForSequenceClassification.from_pretrained("edugp/data2vec-nlp-base", config=config) - >>> # Fine-tune this model + >>> tokenizer = Data2VecTokenizer.from_pretrained("data2vec-base") + >>> config = Data2VecConfig.from_pretrained("data2vec-base") + >>> config.is_decoder = True + >>> model = Data2VecForCausalLM.from_pretrained("data2vec-base", config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) From 4dc16f3aa9d5a7f48c37aa57ec58dc370052da1b Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:45:14 -0500 Subject: [PATCH 063/121] Remove TF and Flax classes from init --- docs/source/index.mdx | 2 +- src/transformers/__init__.py | 25 ------------------------- 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 7d10f48a53c07..278b75db176e7 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -181,7 +181,7 @@ Flax), PyTorch, and/or TensorFlow. 
| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNext | ❌ | ❌ | ✅ | ❌ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| data2vec | ❌ | ❌ | ✅ | ✅ | ✅ | +| data2vec | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5a28c5b54fa37..326739e00ec09 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1766,20 +1766,6 @@ "TFCTRLPreTrainedModel", ] ) - _import_structure["models.data2vec"].extend( - [ - "TF_DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", - "TFData2VecForCausalLM", - "TFData2VecForMaskedLM", - "TFData2VecForMultipleChoice", - "TFData2VecForQuestionAnswering", - "TFData2VecForSequenceClassification", - "TFData2VecForTokenClassification", - "TFData2VecMainLayer", - "TFData2VecModel", - "TFData2VecPreTrainedModel", - ] - ) _import_structure["models.deberta"].extend( [ "TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2241,17 +2227,6 @@ "FlaxCLIPVisionPreTrainedModel", ] ) - _import_structure["models.data2vec"].extend( - [ - "FlaxData2VecForMaskedLM", - "FlaxData2VecForMultipleChoice", - "FlaxData2VecForQuestionAnswering", - "FlaxData2VecForSequenceClassification", - "FlaxData2VecForTokenClassification", - "FlaxData2VecModel", - "FlaxData2VecPreTrainedModel", - ] - ) _import_structure["models.distilbert"].extend( [ "FlaxDistilBertForMaskedLM", From 3f1efe13608fc5aa997f3bc18d1493111f120ea9 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 15:57:26 -0500 Subject: [PATCH 064/121] Add back copy from fairseq data2vec_text.py and fix style --- ..._original_pytorch_checkpoint_to_pytorch.py | 5 +- .../models/data2vec/data2vec_text.py | 472 ++++++++++++++++++ 2 files changed, 474 insertions(+), 3 deletions(-) create mode 100644 src/transformers/models/data2vec/data2vec_text.py diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index 9861e4a6f451e..973af5144bcb6 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -34,9 +34,8 @@ ) from transformers.utils import logging -from .data2vec_text import ( - Data2VecTextModel, # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py -) +# File copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py +from .data2vec_text import Data2VecTextModel if version.parse(fairseq.__version__) < version.parse("0.9.0"): diff --git a/src/transformers/models/data2vec/data2vec_text.py b/src/transformers/models/data2vec/data2vec_text.py new file mode 100644 index 0000000000000..d8cef5f342224 --- /dev/null +++ b/src/transformers/models/data2vec/data2vec_text.py @@ -0,0 +1,472 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +import math +from dataclasses import dataclass, field +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.configs import EMAConfig +from fairseq.models import FairseqEncoder, FairseqEncoderModel, register_model +from fairseq.models.ema import EMA +from fairseq.models.roberta.model import RobertaClassificationHead, RobertaLMHead +from fairseq.models.transformer import TransformerConfig, TransformerEncoder +from fairseq.modules.transformer_sentence_encoder import init_bert_params + +from omegaconf import II + + +logger = logging.getLogger(__name__) + + +@dataclass +class Data2VecTextConfig(FairseqDataclass): + max_positions: int = II("task.tokens_per_sample") + + head_layers: int = 1 + + transformer: TransformerConfig = TransformerConfig() + + load_checkpoint_heads: bool = field( + default=False, + metadata={"help": "(re-)register and load heads when loading checkpoints"}, + ) + + loss_beta: float = field(default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}) + loss_scale: Optional[float] = field( + default=None, + metadata={"help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"}, + ) + average_top_k_layers: int = field(default=8, metadata={"help": "how many layers to average"}) + + layer_norm_target_layer: bool = False + instance_norm_target_layer: bool = False + batch_norm_target_layer: bool = False + instance_norm_targets: bool = False + layer_norm_targets: bool = False + + ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) + ema_end_decay: float = field(default=0.9999, metadata={"help": "final ema decay rate"}) + + # when to finish annealing ema decay rate + ema_anneal_end_step: int = II("optimization.max_update") + + ema_transformer_layers_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer layers"}, + ) + + +def get_annealed_rate(start, end, curr_step, total_steps): + r = end - start + pct_remaining = 1 - curr_step / total_steps + return end - r * pct_remaining + + +@register_model("data2vec_text", dataclass=Data2VecTextConfig) +class Data2VecTextModel(FairseqEncoderModel): + def __init__(self, cfg: Data2VecTextConfig, encoder): + super().__init__(encoder) + self.cfg = cfg + + # We follow BERT's random weight initialization + self.apply(init_bert_params) + + self.classification_heads = nn.ModuleDict() + + @classmethod + def build_model(cls, cfg, task): + """Build a new model instance.""" + + encoder = Data2VecTextEncoder(cfg, task.source_dictionary, task.cfg.data) + + return cls(cfg, encoder) + + def forward( + self, + src_tokens, + target_tokens=None, + features_only=False, + return_all_hiddens=False, + classification_head_name=None, + **kwargs, + ): + if classification_head_name is not None: + features_only = True + + res = self.encoder(src_tokens, target_tokens, features_only, return_all_hiddens, **kwargs) + + if isinstance(res, tuple): + x, extra = res + else: + return res + + if classification_head_name is not None: + x = self.classification_heads[classification_head_name](x) + return x, extra + + def get_normalized_probs(self, net_output, log_probs, sample=None): + """Get normalized probabilities (or log probs) from a net's output.""" + logits = net_output[0].float() + if log_probs: + return F.log_softmax(logits, dim=-1) + else: + return F.softmax(logits, dim=-1) + + def 
register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs): + """Register a classification head.""" + if name in self.classification_heads: + prev_num_classes = self.classification_heads[name].out_proj.out_features + prev_inner_dim = self.classification_heads[name].dense.out_features + if num_classes != prev_num_classes or inner_dim != prev_inner_dim: + logger.warning( + 're-registering head "{}" with num_classes {} (prev: {}) ' + "and inner_dim {} (prev: {})".format( + name, num_classes, prev_num_classes, inner_dim, prev_inner_dim + ) + ) + self.classification_heads[name] = RobertaClassificationHead( + input_dim=self.cfg.transformer.encoder.embed_dim, + inner_dim=inner_dim or self.cfg.transformer.encoder.embed_dim, + num_classes=num_classes, + activation_fn="tanh", + pooler_dropout=0, + ) + + @property + def supported_targets(self): + return {"self"} + + def upgrade_state_dict_named(self, state_dict, name): + prefix = name + "." if name != "" else "" + + # rename decoder -> encoder before upgrading children modules + for k in list(state_dict.keys()): + if k.startswith(prefix + "decoder"): + new_k = prefix + "encoder" + k[len(prefix + "decoder") :] + state_dict[new_k] = state_dict[k] + del state_dict[k] + + # rename emb_layer_norm -> layernorm_embedding + for k in list(state_dict.keys()): + if ".emb_layer_norm." in k: + new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.") + state_dict[new_k] = state_dict[k] + del state_dict[k] + + if self.encoder.regression_head is not None: + if ".lm_head." in k: + new_k = k.replace(".lm_head.", ".regression_head.") + state_dict[new_k] = state_dict[k] + del state_dict[k] + else: + if ".regression_head." in k: + del state_dict[k] + + # upgrade children modules + super().upgrade_state_dict_named(state_dict, name) + + # Handle new classification heads present in the state dict. + current_head_names = ( + [] + if not hasattr(self, "classification_heads") or self.classification_heads is None + else self.classification_heads.keys() + ) + keys_to_delete = [] + for k in state_dict.keys(): + if not k.startswith(prefix + "classification_heads."): + continue + + head_name = k[len(prefix + "classification_heads.") :].split(".")[0] + num_classes = state_dict[prefix + "classification_heads." + head_name + ".out_proj.weight"].size(0) + inner_dim = state_dict[prefix + "classification_heads." + head_name + ".dense.weight"].size(0) + + if self.cfg.load_checkpoint_heads: + if head_name not in current_head_names: + self.register_classification_head(head_name, num_classes, inner_dim) + else: + if head_name not in current_head_names: + logger.warning( + "deleting classification head ({}) from checkpoint " + "not present in current model: {}".format(head_name, k) + ) + keys_to_delete.append(k) + elif ( + num_classes != self.classification_heads[head_name].out_proj.out_features + or inner_dim != self.classification_heads[head_name].dense.out_features + ): + logger.warning( + "deleting classification head ({}) from checkpoint " + "with different dimensions than current model: {}".format(head_name, k) + ) + keys_to_delete.append(k) + for k in keys_to_delete: + del state_dict[k] + + # Copy any newly-added classification heads into the state dict + # with their current weights. + if ( + hasattr(self, "classification_heads") + and self.classification_heads is not None + and len(self.classification_heads) > 0 + ): + cur_state = self.classification_heads.state_dict() + for k, v in cur_state.items(): + if prefix + "classification_heads." 
+ k not in state_dict: + logger.info("Overwriting " + prefix + "classification_heads." + k) + state_dict[prefix + "classification_heads." + k] = v + + for k in list(state_dict.keys()): + if k.startswith(prefix + "encoder.lm_head.") or k.startswith(prefix + "encoder.emb_head."): + del state_dict[k] + + self.encoder.lm_head = None + + if self.encoder.target_model is None: + for k in list(state_dict.keys()): + if k.startswith(prefix + "encoder.target_model."): + del state_dict[k] + + if (self.encoder.ema is None) and (prefix + "encoder._ema" in state_dict): + del state_dict[prefix + "encoder._ema"] + + def remove_pretraining_modules(self, last_layer=None): + self.encoder.lm_head = None + self.encoder.regression_head = None + self.encoder.ema = None + self.classification_heads = None + + if last_layer is not None: + self.encoder.sentence_encoder.layers = nn.ModuleList( + l for i, l in enumerate(self.encoder.sentence_encoder.layers) if i <= last_layer + ) + self.encoder.sentence_encoder.layer_norm = None + + +class Data2VecTextEncoder(FairseqEncoder): + def __init__(self, cfg: Data2VecTextConfig, dictionary, task_data): + super().__init__(dictionary) + + self.cfg = cfg + + embed_tokens = self.build_embedding(len(dictionary), cfg.transformer.encoder.embed_dim, dictionary.pad()) + + self.sentence_encoder = self.build_encoder(cfg, dictionary, embed_tokens) + self.mask_idx = dictionary.index("") + assert self.mask_idx != dictionary.unk(), dictionary.symbols + + self.ema = None + self.average_top_k_layers = cfg.average_top_k_layers + self.loss_scale = cfg.loss_scale + + assert self.cfg.head_layers >= 1 + + embed_dim = cfg.transformer.encoder.embed_dim + curr_dim = embed_dim + projs = [] + for i in range(self.cfg.head_layers - 1): + next_dim = embed_dim * 2 if i == 0 else curr_dim + projs.append(nn.Linear(curr_dim, next_dim)) + projs.append(nn.GELU()) + curr_dim = next_dim + + projs.append(nn.Linear(curr_dim, embed_dim)) + self.regression_head = nn.Sequential(*projs) + + self.num_updates = 0 + + def build_embedding(self, vocab_size, embedding_dim, padding_idx): + return nn.Embedding(vocab_size, embedding_dim, padding_idx) + + def build_encoder(self, cfg, dictionary, embed_tokens): + encoder = TransformerEncoder(cfg.transformer, dictionary, embed_tokens, return_fc=True) + encoder.apply(init_bert_params) + return encoder + + def build_lm_head(self, embed_dim, output_dim, activation_fn, weight): + return RobertaLMHead(embed_dim, output_dim, activation_fn, weight) + + def make_ema_teacher(self): + ema_config = EMAConfig( + ema_decay=self.cfg.ema_decay, + ema_fp32=True, + ) + skip_keys = set() + if self.cfg.ema_transformer_layers_only: + for k, _ in self.sentence_encoder.embed_positions.named_parameters(): + skip_keys.add(f"embed_tokens.{k}") + for k, _ in self.sentence_encoder.embed_positions.named_parameters(): + skip_keys.add(f"embed_positions.{k}") + if self.sentence_encoder.layernorm_embedding is not None: + for ( + k, + _, + ) in self.sentence_encoder.layernorm_embedding.named_parameters(): + skip_keys.add(f"layernorm_embedding.{k}") + if self.sentence_encoder.layer_norm is not None: + for k, _ in self.sentence_encoder.layer_norm.named_parameters(): + skip_keys.add(f"layernorm_embedding.{k}") + + self.ema = EMA( + self.sentence_encoder, + ema_config, + skip_keys=skip_keys, + ) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + + if self.ema is None and self.regression_head is not None: + logger.info("making ema teacher") + self.make_ema_teacher() + elif 
self.training and self.ema is not None: + if self.cfg.ema_decay != self.cfg.ema_end_decay: + if num_updates >= self.cfg.ema_anneal_end_step: + decay = self.cfg.ema_end_decay + else: + decay = get_annealed_rate( + self.cfg.ema_decay, + self.cfg.ema_end_decay, + num_updates, + self.cfg.ema_anneal_end_step, + ) + self.ema._set_decay(decay) + if self.ema.get_decay() < 1: + self.ema.step(self.sentence_encoder) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + state = super().state_dict(destination, prefix, keep_vars) + if self.ema is not None: + state[prefix + "_ema"] = self.ema.fp32_params + return state + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + if self.ema is not None: + k = prefix + "_ema" + assert k in state_dict + self.ema.restore(state_dict[k], True) + del state_dict[k] + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + def forward( + self, + src_tokens, + target_tokens=None, + features_only=False, + return_all_hiddens=False, + masked_tokens=None, + **unused, + ): + """ + Args: + src_tokens (LongTensor): input tokens of shape `(batch, src_len)` + features_only (bool, optional): skip LM head and just return + features. If True, the output will be of shape `(batch, src_len, embed_dim)`. + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + + Returns: + tuple: + - the LM output of shape `(batch, src_len, vocab)` + - a dictionary of additional data, where 'inner_states' is a list of hidden states. Note that the + hidden states have shape `(src_len, batch, vocab)`. + """ + + x, extra = self.extract_features(src_tokens, return_all_hiddens=return_all_hiddens) + + if features_only: + return x, extra + + assert target_tokens is not None + + with torch.no_grad(): + # use EMA parameter as the teacher + self.ema.model.eval() + + encoder_out = self.ema.model( + target_tokens, + return_all_hiddens=True, + ) + y = encoder_out["fc_results"] + + y = y[-self.average_top_k_layers :] + + permuted = False + if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: + y = [tl.permute(1, 2, 0) for tl in y] # TBC -> BCT + permuted = True + + if self.cfg.batch_norm_target_layer: + y = [F.batch_norm(tl.float(), running_mean=None, running_var=None, training=True) for tl in y] + + if self.cfg.instance_norm_target_layer: + y = [F.instance_norm(tl.float()) for tl in y] + + if permuted: + y = [tl.transpose(1, 2) for tl in y] # BCT -> BTC + + if self.cfg.layer_norm_target_layer: + y = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in y] + + y = sum(y) / len(y) + + if not permuted: + y = y.transpose(0, 1) + + if self.cfg.layer_norm_targets: + y = F.layer_norm(y.float(), y.shape[-1:]) + + if self.cfg.instance_norm_targets: + y = F.instance_norm(y.transpose(1, 2)).transpose(1, 2) + + masked_indices = src_tokens.eq(self.mask_idx) + + x = x[masked_indices] + y = y[masked_indices] + + x = self.regression_head(x) + + sz = x.size(-1) + if self.cfg.loss_beta == 0: + loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) + else: + loss = F.smooth_l1_loss(x.float(), y.float(), reduction="none", beta=self.cfg.loss_beta).sum(dim=-1) + + result = { + "losses": { + "main": loss.sum() / math.sqrt(sz) if self.loss_scale <= 0 else loss.sum() * self.loss_scale, + }, + "sample_size": loss.numel(), + } + + # logging other values + other_logs = {"ema_decay": self.ema.get_decay() * 1000} + result["logs"] = other_logs + return result + + def extract_features(self, 
src_tokens, return_all_hiddens=False, **kwargs): + encoder_out = self.sentence_encoder( + src_tokens, + return_all_hiddens=return_all_hiddens, + token_embeddings=kwargs.get("token_embeddings", None), + ) + # T x B x C -> B x T x C + features = encoder_out["encoder_out"][0].transpose(0, 1) + inner_states = encoder_out["encoder_states"] if return_all_hiddens else None + return features, { + "inner_states": inner_states, + "encoder_embedding": encoder_out["encoder_embedding"][0], + } + + def output_layer(self, features, masked_tokens=None, **unused): + return self.lm_head(features, masked_tokens) + + def max_positions(self): + """Maximum output length supported by the encoder.""" + return self.cfg.max_positions From bf8dd78ff1b50a4031171d964dc51c6c21f42d3e Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 16:03:19 -0500 Subject: [PATCH 065/121] Update model name in docs/source/index.mdx to be CamelCase --- docs/source/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 278b75db176e7..c4a42a4de4192 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -181,7 +181,7 @@ Flax), PyTorch, and/or TensorFlow. | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNext | ❌ | ❌ | ✅ | ❌ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| data2vec | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2Vec | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | From 79177661fffd872d2cfb8cfa6e9325f0cc430bef Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sat, 5 Feb 2022 16:12:16 -0500 Subject: [PATCH 066/121] Revert model name in table to lower-case to get check_table test to pass --- docs/source/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index c4a42a4de4192..278b75db176e7 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -181,7 +181,7 @@ Flax), PyTorch, and/or TensorFlow. | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNext | ❌ | ❌ | ✅ | ❌ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| Data2Vec | ❌ | ❌ | ✅ | ❌ | ❌ | +| data2vec | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | From cdbb4b76a92643986fa41d0bb3370b25353bd2c1 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 20:21:16 -0500 Subject: [PATCH 067/121] Update documentation --- docs/source/model_doc/data2vec.mdx | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index a9fc0caff33f0..21ced12bd836f 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -1,4 +1,4 @@ - -# data2vec +# Data2Vec ## Overview -The data2vec model was proposed in []() by . - +The Data2Vec model was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://scontent-sjc3-1.xx.fbcdn.net/v/t39.8562-6/271974914_483120576492438_4239522333319653600_n.pdf?_nc_cat=107&ccb=1-5&_nc_sid=ae5e01&_nc_ohc=7huShTb_QZIAX-N7SYx&_nc_ht=scontent-sjc3-1.xx&oh=00_AT_lXXL69mjqmdVWbaLh4Ro6DY17aFeO5vA9I-mIpyNieg&oe=6205C411) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and Michael Auli. +Data2Vec proposes a unified framework for self-supervised learning across different data modalities - text, audio and images. 
+Importantly, predicted targets for pre-training are contextualized latent representations of the inputs, rather than modality-specific, context-independent targets. The abstract from the paper is the following: -** +*While the general idea of self-supervised learning is identical across modalities, the actual algorithms and +objectives differ widely because they were developed with a single modality in mind. To get us closer to general +self-supervised learning, we present data2vec, a framework that uses the same learning method for either speech, +NLP or computer vision. The core idea is to predict latent representations of the full input data based on a +masked view of the input in a selfdistillation setup using a standard Transformer architecture. +Instead of predicting modality-specific targets such as words, visual tokens or units of human speech which +are local in nature, data2vec predicts contextualized latent representations that contain information from +the entire input. Experiments on the major benchmarks of speech recognition, image classification, and +natural language understanding demonstrate a new state of the art or competitive performance to predominant approaches. +Models and code are available at www.github.com/pytorch/fairseq/tree/master/examples/data2vec.* Tips: - +- This implementation has a shared encoder for all different modalities and different pre-processors for each modality. + For example, in the case of text, preprocessing is identical to [`RobertaModel`], including tokenization. -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). +This model was contributed by [edugp](https://huggingface.co/edugp). +The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec). ## Data2VecConfig From 578cfd818f4001984db1c5f81f5de2034e388349 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:09:53 -0800 Subject: [PATCH 068/121] Update src/transformers/models/data2vec/__init__.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 63b7945d90f38..312a1bf2db7f0 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -2,7 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
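For context on how the classes added in this series are meant to be used for text, here is a minimal feature-extraction sketch. It is an assumption-laden illustration, not part of any commit: it assumes the converted NLP checkpoint is published as `edugp/data2vec-nlp-base` (the name used in the conversion and configuration commits above) and that the RoBERTa tokenizer is the right pre-processor, as the documentation tips state; class names follow the API as it stands at this point in the series.

```python
# Minimal usage sketch (assumptions: the `edugp/data2vec-nlp-base` checkpoint is available on the Hub
# and text pre-processing is identical to RoBERTa, as the documentation above states).
import torch
from transformers import RobertaTokenizer, Data2VecModel

tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = Data2VecModel.from_pretrained("edugp/data2vec-nlp-base")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One contextual vector per input token from the shared Transformer encoder.
print(outputs.last_hidden_state.shape)  # e.g. torch.Size([1, 8, 768])
```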
From cda97e973c92bb8e37369fed73055278e46157c7 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:10:07 -0800 Subject: [PATCH 069/121] Update src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py Co-authored-by: Patrick von Platen --- .../convert_data2vec_original_pytorch_checkpoint_to_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index 973af5144bcb6..58d09a9d2b09e 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. +# Copyright 2022 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 03b41fc9d90931effc4f31606bef5e67eaf79ddb Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:10:17 -0800 Subject: [PATCH 070/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 172958898535b..336e47c0dbbaa 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright 2022 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); From 244e52e1005c86ff499950faf314790e06e244e8 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:10:29 -0800 Subject: [PATCH 071/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/modeling_data2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 336e47c0dbbaa..405ba59e2d5b2 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -1,6 +1,5 @@ # coding=utf-8 # Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
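Stepping back to the pre-training objective carried by `data2vec_text.py` above: `Data2VecTextEncoder.forward` builds its regression targets by averaging the top-K FC outputs of the EMA teacher (optionally instance- or layer-normalized), then regresses the student's output at the masked positions onto those targets with an L2 or smooth-L1 loss. The self-contained PyTorch sketch below reproduces just that target/loss computation on toy tensors; it has no fairseq dependency, and all shapes and hyper-parameter values here are illustrative assumptions rather than values from a released config.

```python
import math
import torch
import torch.nn.functional as F

torch.manual_seed(0)

batch, seq_len, dim, top_k, loss_beta = 2, 6, 16, 3, 4.0

# Stand-ins for the teacher's per-layer FC outputs ("fc_results"), in T x B x C layout.
teacher_layers = [torch.randn(seq_len, batch, dim) for _ in range(8)]

# 1. Keep only the top-K layers and instance-normalize each one
#    (the instance_norm_target_layer=True branch of the reference code).
y = [layer.permute(1, 2, 0) for layer in teacher_layers[-top_k:]]  # T x B x C -> B x C x T
y = [F.instance_norm(layer.float()) for layer in y]
y = [layer.transpose(1, 2) for layer in y]                         # B x C x T -> B x T x C

# 2. Average the normalized layers to build the regression targets.
targets = sum(y) / len(y)

# 3. Student predictions at the masked positions only.
masked = torch.zeros(batch, seq_len, dtype=torch.bool)
masked[:, 1::2] = True                      # pretend every other token was masked
student = torch.randn(batch, seq_len, dim)  # stand-in for encoder output passed through the regression head

x, t = student[masked], targets[masked]

# 4. Smooth-L1 loss per masked token (beta > 0), scaled by 1/sqrt(dim) as in the reference forward pass.
loss = F.smooth_l1_loss(x.float(), t.float(), reduction="none", beta=loss_beta).sum(dim=-1)
print(loss.sum() / math.sqrt(dim))
```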
From d4c82e27370c3ec4897b7caa38fb75325e1f3189 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:11:04 -0800 Subject: [PATCH 072/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/modeling_data2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 405ba59e2d5b2..0ca4f514da055 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -67,7 +67,6 @@ class Data2VecEmbeddings(nn.Module): Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ - # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) From 79eaf6cba67db842e4ebd7fbbef0592092bc6790 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:11:09 -0800 Subject: [PATCH 073/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/modeling_data2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 0ca4f514da055..a627ed82dacc7 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -87,7 +87,6 @@ def __init__(self, config): persistent=False, ) - # End copy self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx From 17f5f86ae9345c3b5ae85a062dde87a24c46f844 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:13:14 -0800 Subject: [PATCH 074/121] Update src/transformers/models/auto/configuration_auto.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/auto/configuration_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ed5844ae35e33..df83f2b872798 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -272,7 +272,7 @@ ("reformer", "Reformer"), ("longformer", "Longformer"), ("roberta", "RoBERTa"), - ("data2vec", "data2vec"), + ("data2vec", "Data2Vec"), ("flaubert", "FlauBERT"), ("fsmt", "FairSeq Machine-Translation"), ("squeezebert", "SqueezeBERT"), From 309a43df73be8b3663bbd5d8f31d10c9077de761 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:13:22 -0800 Subject: [PATCH 075/121] Update src/transformers/models/data2vec/configuration_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/configuration_data2vec.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index f4929e2854d16..b24a27a73538e 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ 
b/src/transformers/models/data2vec/configuration_data2vec.py @@ -1,6 +1,5 @@ # coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From acd9cd4901fb8dec2e90251e58fb34c11c71e245 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:13:30 -0800 Subject: [PATCH 076/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index a627ed82dacc7..33e5b067c6786 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch data2vec model.""" +"""PyTorch Data2Vec model.""" import math From c306788fb250c7150c1381e27c169d7f9ff6732f Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:13:42 -0800 Subject: [PATCH 077/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 33e5b067c6786..39c85fa11a107 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -683,7 +683,7 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): @add_start_docstrings( - "The bare data2vec Model transformer outputting raw hidden-states without any specific head on top.", + "The bare Data2Vec Model transformer outputting raw hidden-states without any specific head on top.", DATA2VEC_START_DOCSTRING, ) # Copied from transformers.models.roberta.modeling_roberta.RobertaModel with ROBERTA->DATA2VEC,Roberta->Data2Vec From b440a4b027023319f5e7bfdb195b33e035861c3b Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:14:05 -0800 Subject: [PATCH 078/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 39c85fa11a107..2b5bd2c043434 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -1152,7 +1152,7 @@ def _tie_weights(self): @add_start_docstrings( """ - data2vec Model transformer with a sequence classification/regression head on top (a linear layer on top of the + Data2Vec Model transformer with a sequence classification/regression head on top (a linear layer on top 
of the pooled output) e.g. for GLUE tasks. """, DATA2VEC_START_DOCSTRING, From 835ca7fb4efa0fe92277af7580c4f8015c6b9864 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:14:11 -0800 Subject: [PATCH 079/121] Update tests/test_modeling_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- tests/test_modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 69a759b4c6b11..4a59ba45c3a27 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From ab52bbff9f3ad805ee469d6a2ea8be0d0ef32d79 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:15:24 -0800 Subject: [PATCH 080/121] Update src/transformers/models/data2vec/configuration_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/configuration_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index b24a27a73538e..706ebb8f61099 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" data2vec configuration""" +""" Data2Vec configuration""" from collections import OrderedDict from typing import Mapping From ec8d4b3d02f366b23d0b2928b12fc20809e8c9b6 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Mon, 7 Feb 2022 15:15:34 -0800 Subject: [PATCH 081/121] Update src/transformers/models/data2vec/modeling_data2vec.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/data2vec/modeling_data2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 2b5bd2c043434..5956cac074afa 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -871,7 +871,7 @@ def forward( @add_start_docstrings( - """data2vec Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VEC_START_DOCSTRING + """Data2Vec Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VEC_START_DOCSTRING ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec,roberta-base->data2vec class Data2VecForCausalLM(Data2VecPreTrainedModel): From 5c98ce25a0bc36a8a28e5b3f86fd94212520d711 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 8 Feb 2022 20:56:16 -0500 Subject: [PATCH 082/121] Copy-paste Data2VecConfig from BertConfig --- .../models/data2vec/configuration_data2vec.py | 99 ++++++++++++++++--- 1 file changed, 86 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index 706ebb8f61099..c058a37a634bd 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -16,9 +16,9 @@ from collections import OrderedDict from typing import Mapping +from ...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig from ...utils import logging -from ..bert.configuration_bert import BertConfig logger = logging.get_logger(__name__) @@ -28,27 +28,66 @@ } -class Data2VecConfig(BertConfig): +class Data2VecConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecModel`] or a [`TFData2VecModel`]. It is - used to instantiate a data2vec model according to the specified arguments, defining the model architecture. - + This is the configuration class to store the configuration of a [`Data2VecModel`], identical to [`BertConfig]. + It is used to instantiate a Data2Vec model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2Vec + [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - The [`Data2VecConfig`] class directly inherits [`BertConfig`]. It reuses the same defaults. Please check the parent - class for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. 
+ hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. Examples: ```python - >>> from transformers import Data2VecConfig, Data2VecModel + >>> from transformers import Data2VecModel, Data2VecConfig - >>> # Initializing a data2vec configuration + >>> # Initializing a Data2Vec edugp/data2vec-nlp-base style configuration >>> configuration = Data2VecConfig() - >>> # Initializing a model from the configuration + >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration >>> model = Data2VecModel(configuration) >>> # Accessing the model configuration @@ -56,9 +95,43 @@ class for more information. 
```""" model_type = "data2vec" - def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): - """Constructs Data2VecConfig.""" - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout class Data2VecOnnxConfig(OnnxConfig): From f8880bc49f46b5680dff0a66767cd346de8b01b5 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 8 Feb 2022 21:17:35 -0500 Subject: [PATCH 083/121] Update config checkpoint to point to edugp/data2vec-nlp-base. Fix style and repo-consistency --- docs/source/index.mdx | 2 +- docs/source/serialization.mdx | 2 +- src/transformers/models/data2vec/configuration_data2vec.py | 6 +++--- src/transformers/models/data2vec/modeling_data2vec.py | 2 ++ 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 278b75db176e7..c4a42a4de4192 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -181,7 +181,7 @@ Flax), PyTorch, and/or TensorFlow. | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNext | ❌ | ❌ | ✅ | ❌ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| data2vec | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2Vec | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/serialization.mdx b/docs/source/serialization.mdx index 302ee51e14019..c77a77114e5f8 100644 --- a/docs/source/serialization.mdx +++ b/docs/source/serialization.mdx @@ -49,7 +49,7 @@ Ready-made configurations include the following architectures: - BART - BERT - CamemBERT -- data2vec +- Data2Vec - DistilBERT - ELECTRA - GPT Neo diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index c058a37a634bd..8dcd44372c313 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -24,14 +24,14 @@ logger = logging.get_logger(__name__) DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "data2vec": "https://huggingface.co/data2vec/resolve/main/config.json", + "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json", } class Data2VecConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecModel`], identical to [`BertConfig]. 
- It is used to instantiate a Data2Vec model according to the specified arguments, defining the model architecture. + This is the configuration class to store the configuration of a [`Data2VecModel`], identical to [`BertConfig]. It + is used to instantiate a Data2Vec model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2Vec [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 5956cac074afa..37c7354d9432a 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -67,6 +67,7 @@ class Data2VecEmbeddings(nn.Module): Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) @@ -87,6 +88,7 @@ def __init__(self, config): persistent=False, ) + # End copy self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx From 0a0a8de2e35e2bbd588d2824beb9c9169665ac85 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 8 Feb 2022 21:40:24 -0500 Subject: [PATCH 084/121] Update config special tokens to match RoBERTa --- src/transformers/models/data2vec/configuration_data2vec.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index 8dcd44372c313..ae56227dd921b 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -94,7 +94,6 @@ class Data2VecConfig(PretrainedConfig): >>> configuration = model.config ```""" model_type = "data2vec" - def __init__( self, vocab_size=30522, @@ -109,13 +108,15 @@ def __init__( type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, - pad_token_id=0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs ): - super().__init__(pad_token_id=pad_token_id, **kwargs) + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size From 2b26499afbd2f06b6accd4de77a8d1fec2ebb446 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 8 Feb 2022 21:41:25 -0500 Subject: [PATCH 085/121] Split multiple assertions and add individual error messages --- ..._original_pytorch_checkpoint_to_pytorch.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py index 58d09a9d2b09e..4173bc3085c5e 100644 --- a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py @@ -95,12 +95,15 @@ def 
convert_data2vec_checkpoint_to_pytorch( # self attention self_attn: BertSelfAttention = layer.attention.self - assert ( - data2vec_layer.self_attn.k_proj.weight.data.shape - == data2vec_layer.self_attn.q_proj.weight.data.shape - == data2vec_layer.self_attn.v_proj.weight.data.shape - == torch.Size((config.hidden_size, config.hidden_size)) - ) + assert data2vec_layer.self_attn.k_proj.weight.data.shape == torch.Size( + (config.hidden_size, config.hidden_size) + ), f"Shape for data2vec_layer.self_attn.k_proj.weight.data should be {torch.Size((config.hidden_size, config.hidden_size))}" + assert data2vec_layer.self_attn.q_proj.weight.data.shape == torch.Size( + (config.hidden_size, config.hidden_size) + ), f"Shape for data2vec_layer.self_attn.q_proj.weight.data should be {torch.Size((config.hidden_size, config.hidden_size))}" + assert data2vec_layer.self_attn.v_proj.weight.data.shape == torch.Size( + (config.hidden_size, config.hidden_size) + ), f"Shape for data2vec_layer.self_attn.v_proj.weight.data should be {torch.Size((config.hidden_size, config.hidden_size))}" self_attn.query.weight.data = data2vec_layer.self_attn.q_proj.weight self_attn.query.bias.data = data2vec_layer.self_attn.q_proj.bias @@ -111,7 +114,9 @@ def convert_data2vec_checkpoint_to_pytorch( # self-attention output self_output: BertSelfOutput = layer.attention.output - assert self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape + assert ( + self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape + ), f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}" self_output.dense.weight = data2vec_layer.self_attn.out_proj.weight self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight @@ -119,13 +124,17 @@ def convert_data2vec_checkpoint_to_pytorch( # intermediate intermediate: BertIntermediate = layer.intermediate - assert intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape + assert ( + intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape + ), f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}" intermediate.dense.weight = data2vec_layer.fc1.weight intermediate.dense.bias = data2vec_layer.fc1.bias # output bert_output: BertOutput = layer.output - assert bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape + assert ( + bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape + ), f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}" bert_output.dense.weight = data2vec_layer.fc2.weight bert_output.dense.bias = data2vec_layer.fc2.bias bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight From c5d373656ba792a4106ae1f8a59103614e00d01f Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 9 Feb 2022 00:38:25 -0500 Subject: [PATCH 086/121] Rename Data2VecModel to Data2VecForTextModel --- docs/source/model_doc/data2vec.mdx | 4 +-- src/transformers/__init__.py | 4 +-- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/data2vec/__init__.py | 4 +-- .../models/data2vec/configuration_data2vec.py | 13 +++---- .../models/data2vec/modeling_data2vec.py | 23 +++++------- src/transformers/utils/dummy_pt_objects.py | 4 +-- tests/test_modeling_data2vec.py | 35 ++++--------------- 8 files changed, 30 insertions(+), 59 deletions(-) diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index 
21ced12bd836f..619c186edfbd9 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -44,9 +44,9 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma [[autodoc]] Data2VecConfig -## Data2VecModel +## Data2VecForTextModel -[[autodoc]] Data2VecModel +[[autodoc]] Data2VecForTextModel - forward ## Data2VecForCausalLM diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 326739e00ec09..c7980f02cdcc3 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -880,8 +880,8 @@ "Data2VecForMultipleChoice", "Data2VecForQuestionAnswering", "Data2VecForSequenceClassification", + "Data2VecForTextModel", "Data2VecForTokenClassification", - "Data2VecModel", "Data2VecPreTrainedModel", ] ) @@ -3035,8 +3035,8 @@ Data2VecForMultipleChoice, Data2VecForQuestionAnswering, Data2VecForSequenceClassification, + Data2VecForTextModel, Data2VecForTokenClassification, - Data2VecModel, Data2VecPreTrainedModel, ) from .models.deberta import ( diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index e9cfe4949606e..772aa885f4d98 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -83,7 +83,7 @@ ("bart", "BartModel"), ("longformer", "LongformerModel"), ("roberta", "RobertaModel"), - ("data2vec", "Data2VecModel"), + ("data2vec", "Data2VecForTextModel"), ("layoutlm", "LayoutLMModel"), ("squeezebert", "SqueezeBertModel"), ("bert", "BertModel"), diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 312a1bf2db7f0..62a37ff78ba9f 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -33,8 +33,8 @@ "Data2VecForMultipleChoice", "Data2VecForQuestionAnswering", "Data2VecForSequenceClassification", + "Data2VecForTextModel", "Data2VecForTokenClassification", - "Data2VecModel", "Data2VecPreTrainedModel", ] @@ -49,8 +49,8 @@ Data2VecForMultipleChoice, Data2VecForQuestionAnswering, Data2VecForSequenceClassification, + Data2VecForTextModel, Data2VecForTokenClassification, - Data2VecModel, Data2VecPreTrainedModel, ) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index ae56227dd921b..32cb944aa243d 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -30,10 +30,10 @@ class Data2VecConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecModel`], identical to [`BertConfig]. It - is used to instantiate a Data2Vec model according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2Vec - [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. + This is the configuration class to store the configuration of a [`Data2VecForTextModel`], identical to + [`BertConfig]. It is used to instantiate a Data2Vec model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + Data2Vec [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. 
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -82,18 +82,19 @@ class Data2VecConfig(PretrainedConfig): Examples: ```python - >>> from transformers import Data2VecModel, Data2VecConfig + >>> from transformers import Data2VecForTextModel, Data2VecConfig >>> # Initializing a Data2Vec edugp/data2vec-nlp-base style configuration >>> configuration = Data2VecConfig() >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration - >>> model = Data2VecModel(configuration) + >>> model = Data2VecForTextModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" model_type = "data2vec" + def __init__( self, vocab_size=30522, diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 37c7354d9432a..d8425fea00236 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -152,7 +152,6 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): return position_ids.unsqueeze(0).expand(input_shape) -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Data2Vec class Data2VecSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() @@ -253,7 +252,7 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in Data2VecModel forward() function) + # Apply the attention mask is (precomputed for all layers in Data2VecForTextModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. 
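As a quick sanity check of the rename in this commit, a minimal sketch (assuming this branch of transformers is installed; the class names and token-id defaults below are taken from the diffs in this series, nothing else is implied):

```python
from transformers import Data2VecConfig, Data2VecForTextModel

config = Data2VecConfig()
# special tokens were aligned with RoBERTa two commits earlier
print(config.pad_token_id, config.bos_token_id, config.eos_token_id)  # 1 0 2

model = Data2VecForTextModel(config)  # randomly initialized text encoder
print(type(model).__name__, config.model_type)  # Data2VecForTextModel data2vec
```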
@@ -689,7 +688,7 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): DATA2VEC_START_DOCSTRING, ) # Copied from transformers.models.roberta.modeling_roberta.RobertaModel with ROBERTA->DATA2VEC,Roberta->Data2Vec -class Data2VecModel(Data2VecPreTrainedModel): +class Data2VecForTextModel(Data2VecPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -875,7 +874,6 @@ def forward( @add_start_docstrings( """Data2Vec Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VEC_START_DOCSTRING ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec,roberta-base->data2vec class Data2VecForCausalLM(Data2VecPreTrainedModel): _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] @@ -887,7 +885,7 @@ def __init__(self, config): if not config.is_decoder: logger.warning("If you want to use `Data2VecLMHeadModel` as a standalone, add `is_decoder=True.`") - self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings @@ -1028,7 +1026,6 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VEC_START_DOCSTRING) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec class Data2VecForMaskedLM(Data2VecPreTrainedModel): _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] @@ -1043,7 +1040,7 @@ def __init__(self, config): "bi-directional self-attention." 
) - self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings @@ -1159,7 +1156,6 @@ def _tie_weights(self): """, DATA2VEC_START_DOCSTRING, ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec class Data2VecForSequenceClassification(Data2VecPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -1168,7 +1164,7 @@ def __init__(self, config): self.num_labels = config.num_labels self.config = config - self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) self.classifier = Data2VecClassificationHead(config) # Initialize weights and apply final processing @@ -1258,14 +1254,13 @@ def forward( """, DATA2VEC_START_DOCSTRING, ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec class Data2VecForMultipleChoice(Data2VecPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config): super().__init__(config) - self.data2vec = Data2VecModel(config) + self.data2vec = Data2VecForTextModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) @@ -1354,7 +1349,6 @@ def forward( """, DATA2VEC_START_DOCSTRING, ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec class Data2VecForTokenClassification(Data2VecPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -1363,7 +1357,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1463,7 +1457,6 @@ def forward(self, features, **kwargs): """, DATA2VEC_START_DOCSTRING, ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with ROBERTA->DATA2VEC,Roberta->Data2Vec,roberta->data2vec class Data2VecForQuestionAnswering(Data2VecPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -1472,7 +1465,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.data2vec = Data2VecModel(config, add_pooling_layer=False) + self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) # Initialize weights and apply final processing diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 58a90261d724a..0ea583b8feb96 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1249,14 +1249,14 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForTokenClassification(metaclass=DummyObject): +class Data2VecForTextModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, 
**kwargs): requires_backends(self, ["torch"]) -class Data2VecModel(metaclass=DummyObject): +class Data2VecForTokenClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 4a59ba45c3a27..3ff5f25bddcbc 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -15,7 +15,6 @@ import unittest -from copy import deepcopy from transformers import Data2VecConfig, is_torch_available from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device @@ -34,8 +33,8 @@ Data2VecForMultipleChoice, Data2VecForQuestionAnswering, Data2VecForSequenceClassification, + Data2VecForTextModel, Data2VecForTokenClassification, - Data2VecModel, ) from transformers.models.data2vec.modeling_data2vec import ( DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, @@ -43,8 +42,6 @@ create_position_ids_from_input_ids, ) -DATA2VEC_TINY = "sshleifer/tiny-distildata2vec" - class Data2VecModelTester: def __init__( @@ -142,7 +139,7 @@ def prepare_config_and_inputs_for_decoder(self): def create_and_check_model( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = Data2VecModel(config=config) + model = Data2VecForTextModel(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) @@ -165,7 +162,7 @@ def create_and_check_model_as_decoder( encoder_attention_mask, ): config.add_cross_attention = True - model = Data2VecModel(config) + model = Data2VecForTextModel(config) model.to(torch_device) model.eval() result = model( @@ -346,7 +343,7 @@ class Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCa ( Data2VecForCausalLM, Data2VecForMaskedLM, - Data2VecModel, + Data2VecForTextModel, Data2VecForSequenceClassification, Data2VecForTokenClassification, Data2VecForMultipleChoice, @@ -433,7 +430,7 @@ def test_for_question_answering(self): @slow def test_model_from_pretrained(self): for model_name in DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = Data2VecModel.from_pretrained(model_name) + model = Data2VecForTextModel.from_pretrained(model_name) self.assertIsNotNone(model) def test_create_position_ids_respects_padding_index(self): @@ -502,7 +499,7 @@ def test_inference_masked_lm(self): @slow def test_inference_no_head(self): - model = Data2VecModel.from_pretrained("data2vec") + model = Data2VecForTextModel.from_pretrained("data2vec") input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) with torch.no_grad(): @@ -534,23 +531,3 @@ def test_inference_classification_head(self): # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) - - # XXX: this might be a candidate for common tests if we have many of those - def test_lm_head_ignore_keys(self): - keys_to_ignore_on_save_tied = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - keys_to_ignore_on_save_untied = [r"lm_head.decoder.bias"] - config = Data2VecConfig.from_pretrained(DATA2VEC_TINY) - config_tied = deepcopy(config) - config_tied.tie_word_embeddings = True - config_untied = deepcopy(config) - config_untied.tie_word_embeddings = False - for cls in [Data2VecForMaskedLM, Data2VecForCausalLM]: - model = cls(config_tied) - self.assertEqual(model._keys_to_ignore_on_save, keys_to_ignore_on_save_tied, cls) - - # the keys should be different 
when embeddings aren't tied - model = cls(config_untied) - self.assertEqual(model._keys_to_ignore_on_save, keys_to_ignore_on_save_untied, cls) - - # test that saving works with updated ignore keys - just testing that it doesn't fail - model.save_pretrained(self.get_auto_remove_tmp_dir()) From 84c6ad16219af04be73234b7dfde8e3b7336a0b2 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 9 Feb 2022 00:49:05 -0500 Subject: [PATCH 087/121] Add Data2Vec to _toctree.yml --- docs/source/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index baaf6e64c837a..a6afc39257823 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -178,6 +178,8 @@ title: CPM - local: model_doc/ctrl title: CTRL + - local: model_doc/data2vec + title: Data2Vec - local: model_doc/deberta title: DeBERTa - local: model_doc/deberta-v2 From d447c9062f664b704d420ce6950acfb19dbc8a0a Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 9 Feb 2022 16:36:04 -0500 Subject: [PATCH 088/121] Rename Data2VecEmbeddings to Data2VecForTextEmbeddings --- src/transformers/models/data2vec/modeling_data2vec.py | 6 ++---- tests/test_modeling_data2vec.py | 10 +++++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index d8425fea00236..97aa70b804fa3 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -62,7 +62,7 @@ # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2Vec -class Data2VecEmbeddings(nn.Module): +class Data2VecForTextEmbeddings(nn.Module): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ @@ -687,7 +687,6 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): "The bare Data2Vec Model transformer outputting raw hidden-states without any specific head on top.", DATA2VEC_START_DOCSTRING, ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaModel with ROBERTA->DATA2VEC,Roberta->Data2Vec class Data2VecForTextModel(Data2VecPreTrainedModel): """ @@ -706,12 +705,11 @@ class Data2VecForTextModel(Data2VecPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Data2Vec def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config - self.embeddings = Data2VecEmbeddings(config) + self.embeddings = Data2VecForTextEmbeddings(config) self.encoder = Data2VecEncoder(config) self.pooler = Data2VecPooler(config) if add_pooling_layer else None diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 3ff5f25bddcbc..3426ed9b86507 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -38,7 +38,7 @@ ) from transformers.models.data2vec.modeling_data2vec import ( DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecEmbeddings, + Data2VecForTextEmbeddings, create_position_ids_from_input_ids, ) @@ -438,10 +438,10 @@ def test_create_position_ids_respects_padding_index(self): test for https://github.com/huggingface/transformers/issues/1761 The position ids should be masked with the embedding object's padding index. 
Therefore, the - first available non-padding position index is Data2VecEmbeddings.padding_idx + 1 + first available non-padding position index is Data2VecForTextEmbeddings.padding_idx + 1 """ config = self.model_tester.prepare_config_and_inputs()[0] - model = Data2VecEmbeddings(config=config) + model = Data2VecForTextEmbeddings(config=config) input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) expected_positions = torch.as_tensor( @@ -457,10 +457,10 @@ def test_create_position_ids_from_inputs_embeds(self): test for https://github.com/huggingface/transformers/issues/1761 The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecEmbeddings.padding_idx + 1 + first available non-padding position index is Data2VecForTextEmbeddings.padding_idx + 1 """ config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = Data2VecEmbeddings(config=config) + embeddings = Data2VecForTextEmbeddings(config=config) inputs_embeds = torch.empty(2, 4, 30) expected_single_positions = [ From 46c0c887fb7bba04e84d221108ca76f7cafef543 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Tue, 15 Feb 2022 22:48:09 -0500 Subject: [PATCH 089/121] Add initial Data2VecForAudio model (unfinished). Only matching fairseq's implementation up to the feature encoder (before positional encoding). --- docs/source/model_doc/data2vec.mdx | 5 + src/transformers/__init__.py | 2 + src/transformers/models/data2vec/__init__.py | 2 + .../models/data2vec/configuration_data2vec.py | 45 +- ..._original_pytorch_checkpoint_to_pytorch.py | 246 +++++++++ ...original_pytorch_checkpoint_to_pytorch.py} | 0 .../models/data2vec/data2vec_audio.py | 481 ++++++++++++++++++ .../models/data2vec/data2vec_text.py | 2 + .../models/data2vec/ema_module.py | 133 +++++ .../models/data2vec/modeling_data2vec.py | 85 +++- src/transformers/utils/dummy_pt_objects.py | 7 + utils/check_repo.py | 2 + 12 files changed, 1005 insertions(+), 5 deletions(-) create mode 100644 src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py rename src/transformers/models/data2vec/{convert_data2vec_original_pytorch_checkpoint_to_pytorch.py => convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py} (100%) create mode 100644 src/transformers/models/data2vec/data2vec_audio.py create mode 100644 src/transformers/models/data2vec/ema_module.py diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index 619c186edfbd9..2471ccc883c32 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -44,6 +44,11 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma [[autodoc]] Data2VecConfig +## Data2VecForAudioModel + +[[autodoc]] Data2VecForAudioModel + - forward + ## Data2VecForTextModel [[autodoc]] Data2VecForTextModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c7980f02cdcc3..baec2791a6042 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -875,6 +875,7 @@ _import_structure["models.data2vec"].extend( [ "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecForAudioModel", "Data2VecForCausalLM", "Data2VecForMaskedLM", "Data2VecForMultipleChoice", @@ -3030,6 +3031,7 @@ ) from .models.data2vec import ( DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecForAudioModel, Data2VecForCausalLM, Data2VecForMaskedLM, Data2VecForMultipleChoice, diff --git 
a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 62a37ff78ba9f..00873fff121d3 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -28,6 +28,7 @@ if is_torch_available(): _import_structure["modeling_data2vec"] = [ "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecForAudioModel", "Data2VecForCausalLM", "Data2VecForMaskedLM", "Data2VecForMultipleChoice", @@ -44,6 +45,7 @@ if is_torch_available(): from .modeling_data2vec import ( DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecForAudioModel, Data2VecForCausalLM, Data2VecForMaskedLM, Data2VecForMultipleChoice, diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index 32cb944aa243d..dd2260a13cacd 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -30,10 +30,10 @@ class Data2VecConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecForTextModel`], identical to - [`BertConfig]. It is used to instantiate a Data2Vec model according to the specified arguments, defining the model - architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the - Data2Vec [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. + This is the configuration class to store the configuration of a [`Data2VecForTextModel`] and + [`Data2VecForAudioModel`]. It is used to instantiate a Data2Vec model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the Data2Vec [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
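The hunk below adds the audio-only fields to the shared config. As orientation, a rough sketch of what a default `Data2VecConfig` exposes once this commit is applied (values copied from the defaults added in this file; assumes this branch of transformers is installed):

```python
from transformers import Data2VecConfig

config = Data2VecConfig()
# text-side defaults, unchanged
print(config.hidden_size, config.num_hidden_layers)  # 768 12
# audio-only defaults introduced below
print(config.conv_kernel)                            # (10, 3, 3, 3, 3, 2, 2)
print(config.conv_stride)                            # (5, 2, 2, 2, 2, 2, 2)
print(config.mask_time_prob, config.add_adapter)     # 0.05 False
```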
@@ -115,6 +115,22 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, + # Here starts audio-only parameters + feat_extract_norm="layer", + feat_extract_activation="gelu", + num_conv_pos_embeddings=19, + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + feat_proj_dropout=0.0, + mask_time_prob=0.05, + do_stable_layer_norm=False, + num_conv_pos_embedding_groups=16, + activation_dropout=0.1, + add_adapter=False, + final_dropout=0.1, + proj_codevector_dim=256, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -134,6 +150,27 @@ def __init__( self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout + # Here starts audio-only parameters + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_feat_extract_layers = len(self.conv_dim) + self.feat_proj_dropout = feat_proj_dropout + self.mask_time_prob = mask_time_prob + self.do_stable_layer_norm = do_stable_layer_norm + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + # Using the same dropout as for text + self.hidden_dropout = self.hidden_dropout_prob + # Using the same dropout as for text + self.attention_dropout = self.attention_probs_dropout_prob + self.activation_dropout = activation_dropout + self.add_adapter = add_adapter + self.final_dropout = final_dropout + self.proj_codevector_dim = proj_codevector_dim class Data2VecOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000..abf8285674f3e --- /dev/null +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,246 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert Wav2Vec2 checkpoint.""" + + +import argparse +import os + +import torch +from datasets import load_dataset + +import soundfile as sf +from transformers import Wav2Vec2Processor, logging + +from ..data2vec import Data2VecConfig, Data2VecForAudioModel + +# Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py +from .data2vec_audio import Data2VecAudioModel + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +MAPPING = { + "post_extract_proj": "feature_projection.projection", + "models.0.layer_norm": "feature_projection.layer_norm", + "encoder.pos_conv.0": "pos_conv_embed.conv", + "self_attn.k_proj": "encoder.layer.*.attention.self.key", + "self_attn.v_proj": "encoder.layer.*.attention.self.value", + "self_attn.q_proj": "encoder.layer.*.attention.self.query", + "self_attn.out_proj.weight": "encoder.layer.*.attention.output.dense", + "self_attn_layer_norm": "encoder.layer.*.attention.output.LayerNorm", + "fc1": "encoder.layer.*.intermediate.dense", + "fc2": "encoder.layer.*.output.dense", + "final_layer_norm": "encoder.layer.*.output.LayerNorm", + "encoder.layer_norm": "layer_norm", + "w2v_model.layer_norm": "feature_projection.layer_norm", + "quantizer.weight_proj": "quantizer.weight_proj", + "quantizer.vars": "quantizer.codevectors", + "project_q": "project_q", + # "final_proj": "project_hid", + "w2v_encoder.proj": "lm_head", + "mask_emb": "masked_spec_embed", +} +TOP_LEVEL_KEYS = [ + "lm_head", + "quantizer.weight_proj", + "quantizer.codevectors", + "project_q", + "project_hid", +] + + +def set_recursively(hf_pointer, key, value, full_name, weight_type): + for attribute in key.split("."): + hf_pointer = getattr(hf_pointer, attribute) + + if weight_type is not None: + hf_shape = getattr(hf_pointer, weight_type).shape + else: + hf_shape = hf_pointer.shape + + if hf_shape != value.shape: + raise ValueError( + f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be {value.shape} for {full_name}" + ) + + if weight_type == "weight": + hf_pointer.weight.data = value + elif weight_type == "weight_g": + hf_pointer.weight_g.data = value + elif weight_type == "weight_v": + hf_pointer.weight_v.data = value + elif weight_type == "bias": + hf_pointer.bias.data = value + else: + hf_pointer.data = value + + logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") + + +def recursively_load_weights(fairseq_model, hf_model, is_headless): + unused_weights = [] + fairseq_dict = fairseq_model.state_dict() + + feature_extractor = hf_model.feature_extractor + + for name, value in fairseq_dict.items(): + is_used = False + if "conv_layers" in name: + load_conv_layer( + name, + value, + feature_extractor, + unused_weights, + hf_model.config.feat_extract_norm == "group", + ) + is_used = True + else: + for key, mapped_key in MAPPING.items(): + # mapped_key = "wav2vec2." 
+ mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key + if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]: + is_used = True + if "*" in mapped_key: + layer_index = name.split(key)[0].split(".")[-2] + mapped_key = mapped_key.replace("*", layer_index) + if "weight_g" in name: + weight_type = "weight_g" + elif "weight_v" in name: + weight_type = "weight_v" + elif "bias" in name: + weight_type = "bias" + elif "weight" in name: + # TODO: don't match quantizer.weight_proj + weight_type = "weight" + else: + weight_type = None + set_recursively(hf_model, mapped_key, value, name, weight_type) + continue + if not is_used: + unused_weights.append(name) + + logger.warning(f"Unused weights: {unused_weights}") + + +def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): + name = full_name.split("conv_layers.")[-1] + items = name.split(".") + layer_id = int(items[0]) + type_id = int(items[1]) + + if type_id == 0: + if "bias" in name: + if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].conv.bias.data = value + logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") + elif "weight" in name: + if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].conv.weight.data = value + logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") + elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): + if "bias" in name: + if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value + logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") + elif "weight" in name: + if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value + logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") + else: + unused_weights.append(full_name) + + +@torch.no_grad() +def convert_wav2vec2_checkpoint( + checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True +): + """ + Copy/paste/tweak model's weights to transformers design. 
+ """ + if config_path is not None: + config = Data2VecConfig.from_pretrained(config_path) + else: + config = Data2VecConfig() + + # hf_wav2vec = Wav2Vec2ForPreTraining(config) + hf_wav2vec = Data2VecForAudioModel(config) + + # Modify final_proj layer name + data2vec_checkpoint_dir = os.path.dirname(checkpoint_path) + state_dict = torch.load(checkpoint_path) + state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight") + state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias") + converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt") + torch.save(state_dict, converted_ckpt) + model = Data2VecAudioModel.from_pretrained(data2vec_checkpoint_dir, checkpoint_file=converted_ckpt) + model = model.eval() + + recursively_load_weights(model, hf_wav2vec, not is_finetuned) + + def map_to_array(batch): + speech, _ = sf.read(batch["file"]) + batch["speech"] = speech + return batch + + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = ds.map(map_to_array) + input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 + hf_wav2vec.eval() + our_output = hf_wav2vec(input_values)["last_hidden_state"] + # our_hidden_states = hf_wav2vec(input_values, output_hidden_states=-True).hidden_states + model.eval() + their_output = model.models[0](input_values, mask=False, features_only=True)["layer_results"][-1][0].transpose( + 0, 1 + ) + # their_hidden_states = model.models[0](input_values, mask=False, features_only=True)["layer_results"] + + print(our_output.shape, their_output.shape) + max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() + print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 + success = torch.allclose(our_output, their_output, atol=1e-3) + print("Do both models output the same tensors?", "🔥" if success else "💩") + if not success: + raise Exception("Something went wRoNg") + + hf_wav2vec.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") + parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model") + parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") + parser.add_argument( + "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not" + ) + args = parser.parse_args() + convert_wav2vec2_checkpoint( + args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned + ) diff --git a/src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py similarity index 100% rename from src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py rename to src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py diff --git a/src/transformers/models/data2vec/data2vec_audio.py b/src/transformers/models/data2vec/data2vec_audio.py new file mode 100644 index 0000000000000..c9cee07c2671a --- /dev/null +++ 
b/src/transformers/models/data2vec/data2vec_audio.py @@ -0,0 +1,481 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# TODO: Remove file + +import logging +import math +from dataclasses import dataclass, field +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from fairseq.data.data_utils import compute_mask_indices +from fairseq.models import BaseFairseqModel, register_model +from fairseq.models.wav2vec import ConvFeatureExtractionModel, TransformerEncoder, Wav2Vec2Config +from fairseq.modules import GradMultiply, LayerNorm +from fairseq.utils import index_put + +from omegaconf import II + +# Copied from https://github.com/pytorch/fairseq/blob/327cff24a57c2ae06657731bf3be86ee88fccfea/fairseq/modules/ema_module.py +from .ema_module import EMAModule, EMAModuleConfig + + +logger = logging.getLogger(__name__) + + +@dataclass +class Data2VecAudioConfig(Wav2Vec2Config): + + loss_beta: float = field(default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}) + loss_scale: Optional[float] = field( + default=None, + metadata={"help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"}, + ) + average_top_k_layers: int = field(default=8, metadata={"help": "how many layers to average"}) + + layer_norm_target_layer: bool = False + instance_norm_target_layer: bool = False + instance_norm_targets: bool = False + layer_norm_targets: bool = False + batch_norm_target_layer: bool = False + group_norm_target_layer: bool = False + + ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) + ema_end_decay: float = field(default=0.9999, metadata={"help": "final ema decay rate"}) + + # when to finish annealing ema decay rate + ema_anneal_end_step: int = II("optimization.max_update") + + ema_transformer_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer"}, + ) + ema_layers_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer layers"}, + ) + + max_update: int = II("optimization.max_update") + + min_target_var: float = field(default=0.1, metadata={"help": "stop training if target var falls below this"}) + min_pred_var: float = field( + default=0.01, + metadata={"help": "stop training if prediction var falls below this"}, + ) + + +def get_annealed_rate(start, end, curr_step, total_steps): + r = end - start + pct_remaining = 1 - curr_step / total_steps + return end - r * pct_remaining + + +@register_model("data2vec_audio", dataclass=Data2VecAudioConfig) +class Data2VecAudioModel(BaseFairseqModel): + def __init__(self, cfg: Data2VecAudioConfig): + super().__init__() + self.cfg = cfg + + feature_enc_layers = eval(cfg.conv_feature_layers) + self.extractor_embed = feature_enc_layers[-1][0] + + self.ema = None + self.embed = cfg.encoder_embed_dim + + self.average_top_k_layers = cfg.average_top_k_layers + self.loss_beta = cfg.loss_beta + self.loss_scale = cfg.loss_scale + + self.feature_extractor = ConvFeatureExtractionModel( + conv_layers=feature_enc_layers, + dropout=0.0, + mode=cfg.extractor_mode, + conv_bias=cfg.conv_bias, + ) + + self.post_extract_proj = nn.Linear(self.extractor_embed, cfg.encoder_embed_dim) + + self.mask_prob = cfg.mask_prob + self.mask_selection = cfg.mask_selection + self.mask_other = 
cfg.mask_other + self.mask_length = cfg.mask_length + self.no_mask_overlap = cfg.no_mask_overlap + self.mask_min_space = cfg.mask_min_space + + self.mask_channel_prob = cfg.mask_channel_prob + self.mask_channel_before = cfg.mask_channel_before + self.mask_channel_selection = cfg.mask_channel_selection + self.mask_channel_other = cfg.mask_channel_other + self.mask_channel_length = cfg.mask_channel_length + self.no_mask_channel_overlap = cfg.no_mask_channel_overlap + self.mask_channel_min_space = cfg.mask_channel_min_space + + self.dropout_input = nn.Dropout(cfg.dropout_input) + self.dropout_features = nn.Dropout(cfg.dropout_features) + + self.feature_grad_mult = cfg.feature_grad_mult + + self.mask_emb = nn.Parameter(torch.FloatTensor(cfg.encoder_embed_dim).uniform_()) + + self.encoder = TransformerEncoder(cfg) + self.layer_norm = LayerNorm(self.extractor_embed) + + self.final_proj = nn.Linear(self.embed, self.embed) + + self.num_updates = 0 + + def make_ema_teacher(self): + ema_config = EMAModuleConfig( + ema_decay=self.cfg.ema_decay, + ema_fp32=True, + ) + skip_keys = set() + if self.cfg.ema_layers_only: + self.cfg.ema_transformer_only = True + for k, _ in self.encoder.pos_conv.named_parameters(): + skip_keys.add(f"pos_conv.{k}") + + self.ema = EMAModule( + self.encoder if self.cfg.ema_transformer_only else self, + ema_config, + skip_keys=skip_keys, + ) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + + if self.ema is None and self.final_proj is not None: + logger.info("making ema teacher") + self.make_ema_teacher() + elif self.training and self.ema is not None: + if self.cfg.ema_decay != self.cfg.ema_end_decay: + if num_updates >= self.cfg.ema_anneal_end_step: + decay = self.cfg.ema_end_decay + else: + decay = get_annealed_rate( + self.cfg.ema_decay, + self.cfg.ema_end_decay, + num_updates, + self.cfg.ema_anneal_end_step, + ) + self.ema.set_decay(decay) + if self.ema.get_decay() < 1: + self.ema.step(self.encoder if self.cfg.ema_transformer_only else self) + + self.num_updates = num_updates + + def state_dict(self, destination=None, prefix="", keep_vars=False): + state = super().state_dict(destination, prefix, keep_vars) + + if self.ema is not None: + state[prefix + "_ema"] = self.ema.fp32_params + + return state + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + if self.ema is not None: + k = prefix + "_ema" + assert k in state_dict + self.ema.restore(state_dict[k], True) + del state_dict[k] + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + @classmethod + def build_model(cls, cfg: Data2VecAudioConfig, task=None): + """Build a new model instance.""" + + return cls(cfg) + + def apply_mask( + self, + x, + padding_mask, + mask_indices=None, + mask_channel_indices=None, + ): + B, T, C = x.shape + + if self.mask_channel_prob > 0 and self.mask_channel_before: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = torch.from_numpy(mask_channel_indices).to(x.device).unsqueeze(1).expand(-1, T, -1) + x[mask_channel_indices] = 0 + + if self.mask_prob > 0: + if mask_indices is None: + mask_indices = compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=1, + no_overlap=self.no_mask_overlap, + 
min_space=self.mask_min_space, + require_same_masks=self.cfg.require_same_masks, + mask_dropout=self.cfg.mask_dropout, + ) + mask_indices = torch.from_numpy(mask_indices).to(x.device) + x = index_put(x, mask_indices, self.mask_emb) + else: + mask_indices = None + + if self.mask_channel_prob > 0 and not self.mask_channel_before: + if mask_channel_indices is None: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + torch.from_numpy(mask_channel_indices).to(x.device).unsqueeze(1).expand(-1, T, -1) + ) + x = index_put(x, mask_channel_indices, 0) + + return x, mask_indices + + def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): + """ + Computes the output length of the convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + return torch.floor((input_length - kernel_size) / stride + 1) + + conv_cfg_list = eval(self.cfg.conv_feature_layers) + + for i in range(len(conv_cfg_list)): + input_lengths = _conv_out_length(input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2]) + + return input_lengths.to(torch.long) + + def forward( + self, + source, + padding_mask=None, + mask=True, + features_only=False, + layer=None, + mask_indices=None, + mask_channel_indices=None, + padding_count=None, + ): + features = source + + if self.feature_grad_mult > 0: + features = self.feature_extractor(features) + if self.feature_grad_mult != 1.0: + features = GradMultiply.apply(features, self.feature_grad_mult) + else: + with torch.no_grad(): + features = self.feature_extractor(features) + + features = features.transpose(1, 2) + + features = self.layer_norm(features) + + orig_padding_mask = padding_mask + + if padding_mask is not None and padding_mask.any(): + input_lengths = (1 - padding_mask.long()).sum(-1) + # apply conv formula to get real output_lengths + output_lengths = self._get_feat_extract_output_lengths(input_lengths) + + padding_mask = torch.zeros(features.shape[:2], dtype=features.dtype, device=features.device) + + # these two operations makes sure that all values + # before the output lengths indices are attended to + padding_mask[ + ( + torch.arange(padding_mask.shape[0], device=padding_mask.device), + output_lengths - 1, + ) + ] = 1 + padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() + else: + padding_mask = None + + if self.post_extract_proj is not None: + features = self.post_extract_proj(features) + + pre_encoder_features = None + if self.cfg.ema_transformer_only: + pre_encoder_features = features.clone() + + features = self.dropout_input(features) + + if mask: + x, mask_indices = self.apply_mask( + features, + padding_mask, + mask_indices=mask_indices, + mask_channel_indices=mask_channel_indices, + ) + else: + x = features + mask_indices = None + print(f"Before encoder FB: {x.shape} {x}") + x, layer_results = self.encoder( + x, + padding_mask=padding_mask, + layer=layer, + ) + + if features_only: + return { + "x": x, + "padding_mask": padding_mask, + "layer_results": layer_results, + } + + result = { + "losses": {}, + } + + with torch.no_grad(): + self.ema.model.eval() + + if self.cfg.ema_transformer_only: + y, layer_results = self.ema.model.extract_features( + pre_encoder_features, + padding_mask=padding_mask, + min_layer=self.cfg.encoder_layers - self.average_top_k_layers, + ) + 
y = { + "x": y, + "padding_mask": padding_mask, + "layer_results": layer_results, + } + else: + y = self.ema.model.extract_features( + source=source, + padding_mask=orig_padding_mask, + mask=False, + ) + + target_layer_results = [l[2] for l in y["layer_results"]] + + permuted = False + if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: + target_layer_results = [tl.permute(1, 2, 0) for tl in target_layer_results] # TBC -> BCT + permuted = True + + if self.cfg.batch_norm_target_layer: + target_layer_results = [ + F.batch_norm(tl.float(), running_mean=None, running_var=None, training=True) + for tl in target_layer_results + ] + + if self.cfg.instance_norm_target_layer: + target_layer_results = [F.instance_norm(tl.float()) for tl in target_layer_results] + + if permuted: + target_layer_results = [tl.transpose(1, 2) for tl in target_layer_results] # BCT -> BTC + + if self.cfg.group_norm_target_layer: + target_layer_results = [F.layer_norm(tl.float(), tl.shape[-2:]) for tl in target_layer_results] + + if self.cfg.layer_norm_target_layer: + target_layer_results = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in target_layer_results] + + y = sum(target_layer_results) / len(target_layer_results) + + if self.cfg.layer_norm_targets: + y = F.layer_norm(y.float(), y.shape[-1:]) + + if self.cfg.instance_norm_targets: + y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2) + + if not permuted: + y = y.transpose(0, 1) + + y = y[mask_indices] + + x = x[mask_indices] + x = self.final_proj(x) + + sz = x.size(-1) + + if self.loss_beta == 0: + loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) + else: + loss = F.smooth_l1_loss(x.float(), y.float(), reduction="none", beta=self.loss_beta).sum(dim=-1) + + if self.loss_scale is not None: + scale = self.loss_scale + else: + scale = 1 / math.sqrt(sz) + + result["losses"]["regression"] = loss.sum() * scale + + if "sample_size" not in result: + result["sample_size"] = loss.numel() + + with torch.no_grad(): + result["target_var"] = self.compute_var(y) + result["pred_var"] = self.compute_var(x.float()) + + if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var: + logger.error(f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting") + raise Exception(f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting") + if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var: + logger.error(f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting") + raise Exception(f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting") + + if self.ema is not None: + result["ema_decay"] = self.ema.get_decay() * 1000 + + return result + + @staticmethod + def compute_var(y): + y = y.view(-1, y.size(-1)) + if dist.is_initialized(): + zc = torch.tensor(y.size(0)).cuda() + zs = y.sum(dim=0) + zss = (y ** 2).sum(dim=0) + + dist.all_reduce(zc) + dist.all_reduce(zs) + dist.all_reduce(zss) + + var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1)) + return torch.sqrt(var + 1e-6).mean() + else: + return torch.sqrt(y.var(dim=0) + 1e-6).mean() + + def extract_features(self, source, padding_mask, mask=False, layer=None): + res = self.forward( + source, + padding_mask, + mask=mask, + features_only=True, + layer=layer, + ) + return res + + def remove_pretraining_modules(self, last_layer=None): + self.final_proj = None + self.ema = None + if last_layer is not None: + self.encoder.layers = nn.ModuleList(l for i, l in 
enumerate(self.encoder.layers) if i <= last_layer) diff --git a/src/transformers/models/data2vec/data2vec_text.py b/src/transformers/models/data2vec/data2vec_text.py index d8cef5f342224..b55dbddd733dd 100644 --- a/src/transformers/models/data2vec/data2vec_text.py +++ b/src/transformers/models/data2vec/data2vec_text.py @@ -3,6 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +# TODO: Remove file + import logging import math from dataclasses import dataclass, field diff --git a/src/transformers/models/data2vec/ema_module.py b/src/transformers/models/data2vec/ema_module.py new file mode 100644 index 0000000000000..98ceeb4e83735 --- /dev/null +++ b/src/transformers/models/data2vec/ema_module.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 + +# TODO: Remove file + +""" +Used for EMA tracking a given pytorch module. The user is responsible for calling step() and setting the appropriate +decay +""" + +import copy +import logging +from dataclasses import dataclass, field + +import torch +from fairseq.dataclass import FairseqDataclass + + +@dataclass +class EMAModuleConfig(FairseqDataclass): + ema_decay: float = field(default=0.9999, metadata={"help": "decay for exponential moving average model"}) + ema_fp32: bool = field( + default=False, + metadata={"help": "If true, store EMA model in fp32 even if model is in fp16"}, + ) + + +class EMAModule: + """Exponential Moving Average of Fairseq Models""" + + def __init__(self, model, config: EMAModuleConfig, device=None, skip_keys=None): + """ + @param model model to initialize the EMA with @param config EMAConfig object with configuration like ema_decay, + ema_update_freq, ema_fp32 @param device If provided, copy EMA to this device (e.g. gpu). Otherwise EMA is in + the same device as the model. + """ + + self.decay = config.ema_decay + self.model = copy.deepcopy(model) + self.model.requires_grad_(False) + self.config = config + self.skip_keys = skip_keys or set() + self.fp32_params = {} + + if device is not None: + logging.info(f"Copying EMA model to device {device}") + self.model = self.model.to(device=device) + + if self.config.ema_fp32: + self.build_fp32_params() + + self.update_freq_counter = 0 + + def build_fp32_params(self, state_dict=None): + """ + Store a copy of the EMA params in fp32. If state dict is passed, the EMA params is copied from the provided + state dict. Otherwise, it is copied from the current EMA model parameters. + """ + if not self.config.ema_fp32: + raise RuntimeError( + "build_fp32_params should not be called if ema_fp32=False. " + "Use ema_fp32=True if this is really intended." 
+ ) + + if state_dict is None: + state_dict = self.model.state_dict() + + def _to_float(t): + return t.float() if torch.is_floating_point(t) else t + + for param_key in state_dict: + if param_key in self.fp32_params: + self.fp32_params[param_key].copy_(state_dict[param_key]) + else: + self.fp32_params[param_key] = _to_float(state_dict[param_key]) + + def restore(self, state_dict, build_fp32_params=False): + """Load data from a model spec into EMA model""" + self.model.load_state_dict(state_dict, strict=False) + if build_fp32_params: + self.build_fp32_params(state_dict) + + def set_decay(self, decay): + self.decay = decay + + def get_decay(self): + return self.decay + + def _step_internal(self, new_model): + """One update of the EMA model based on new model weights""" + decay = self.decay + + ema_state_dict = {} + ema_params = self.fp32_params if self.config.ema_fp32 else self.model.state_dict() + for key, param in new_model.state_dict().items(): + if isinstance(param, dict): + continue + try: + ema_param = ema_params[key] + except KeyError: + ema_param = param.float().clone() if param.ndim == 1 else copy.deepcopy(param) + + if param.shape != ema_param.shape: + raise ValueError( + "incompatible tensor shapes between model param and ema param" + + "{} vs. {}".format(param.shape, ema_param.shape) + ) + + if "version" in key: + # Do not decay a model.version pytorch param + continue + + if key in self.skip_keys: + ema_param = param.to(dtype=ema_param.dtype).clone() + ema_params[key].copy_(ema_param) + else: + ema_param.mul_(decay) + ema_param.add_(param.to(dtype=ema_param.dtype), alpha=1 - decay) + ema_state_dict[key] = ema_param + self.restore(ema_state_dict, build_fp32_params=False) + + def step(self, new_model): + self._step_internal(new_model) + + def reverse(self, model): + """ + Load the model parameters from EMA model. Useful for inference or fine-tuning from the EMA model. 
+ """ + d = self.model.state_dict() + if "_ema" in d: + del d["_ema"] + + model.load_state_dict(d, strict=False) + return model diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 97aa70b804fa3..2e53b915a65a9 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -45,6 +45,13 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) +from ...models.wav2vec2.modeling_wav2vec2 import ( + Wav2Vec2Adapter, + Wav2Vec2BaseModelOutput, + Wav2Vec2FeatureEncoder, + Wav2Vec2FeatureProjection, + Wav2Vec2PositionalConvEmbedding, +) from ...utils import logging from .configuration_data2vec import Data2VecConfig @@ -684,7 +691,83 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): @add_start_docstrings( - "The bare Data2Vec Model transformer outputting raw hidden-states without any specific head on top.", + "The bare Data2Vec Model for audio transformer outputting raw hidden-states without any specific head on top.", + DATA2VEC_START_DOCSTRING, +) +class Data2VecForAudioModel(Data2VecPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.feature_extractor = Wav2Vec2FeatureEncoder(config) + self.feature_projection = Wav2Vec2FeatureProjection(config) + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) + + self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None + + self.encoder = Data2VecEncoder(config) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + hidden_states, extract_features = self.feature_projection(extract_features) + print(f"Before positional encoding HF:\n {hidden_states.shape} {hidden_states}") + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] 
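+ # when `return_dict` is False, the tuple above mirrors the fields of `Wav2Vec2BaseModelOutput`:
+ # (last_hidden_state, extract_features), followed by the encoder's optional hidden states / attentions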
+ + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The bare Data2Vec Model for text transformer outputting raw hidden-states without any specific head on top.", DATA2VEC_START_DOCSTRING, ) class Data2VecForTextModel(Data2VecPreTrainedModel): diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 0ea583b8feb96..97920e04e7f67 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1214,6 +1214,13 @@ def __init__(self, *args, **kwargs): DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = None +class Data2VecForAudioModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Data2VecForCausalLM(metaclass=DummyObject): _backends = ["torch"] diff --git a/utils/check_repo.py b/utils/check_repo.py index b4a7d99a26e08..798273fe7c572 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -88,6 +88,7 @@ "TFRobertaForMultipleChoice", # TODO: fix "TrOCRDecoderWrapper", # Building part of bigger (tested) model. "SeparableConv1D", # Building part of bigger (tested) model. + "Data2VecForAudioModel", # TODO: Add tests ] # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't @@ -168,6 +169,7 @@ "VisualBertForMultipleChoice", "TFWav2Vec2ForCTC", "TFHubertForCTC", + "Data2VecForAudioModel", # TODO: Fix ] # This is to make sure the transformers module imported is the one in the repo. From 960fe56dd85921fd4ec6f0512a1d49daaa55df29 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 18 Feb 2022 11:40:28 +0000 Subject: [PATCH 090/121] finish audio model --- .../models/data2vec/configuration_data2vec.py | 7 +- ..._original_pytorch_checkpoint_to_pytorch.py | 204 ++++++---- .../models/data2vec/data2vec_audio.py | 1 - .../models/data2vec/modeling_data2vec.py | 354 +++++++++++++++++- 4 files changed, 466 insertions(+), 100 deletions(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index dd2260a13cacd..2d423b61d6fe1 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -116,9 +116,9 @@ def __init__( use_cache=True, classifier_dropout=None, # Here starts audio-only parameters + layer_norm_eps_audio=1e-5, feat_extract_norm="layer", feat_extract_activation="gelu", - num_conv_pos_embeddings=19, conv_dim=(512, 512, 512, 512, 512, 512, 512), conv_stride=(5, 2, 2, 2, 2, 2, 2), conv_kernel=(10, 3, 3, 3, 3, 2, 2), @@ -127,6 +127,8 @@ def __init__( mask_time_prob=0.05, do_stable_layer_norm=False, num_conv_pos_embedding_groups=16, + conv_pos_kernel_size=19, + num_conv_pos_embeddings=5, activation_dropout=0.1, add_adapter=False, final_dropout=0.1, @@ -153,7 +155,6 @@ def __init__( # Here starts audio-only parameters self.feat_extract_norm = feat_extract_norm self.feat_extract_activation = feat_extract_activation - self.num_conv_pos_embeddings = num_conv_pos_embeddings self.conv_dim = conv_dim self.conv_stride = conv_stride self.conv_kernel = conv_kernel @@ -163,6 +164,8 @@ def __init__( self.mask_time_prob = mask_time_prob self.do_stable_layer_norm = do_stable_layer_norm self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + 
self.conv_pos_kernel_size = conv_pos_kernel_size + self.num_conv_pos_embeddings = num_conv_pos_embeddings # Using the same dropout as for text self.hidden_dropout = self.hidden_dropout_prob # Using the same dropout as for text diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py index abf8285674f3e..f039e8f89c708 100644 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -19,15 +19,17 @@ import os import torch -from datasets import load_dataset +from functools import reduce -import soundfile as sf from transformers import Wav2Vec2Processor, logging +import fairseq +from datasets import load_dataset -from ..data2vec import Data2VecConfig, Data2VecForAudioModel +from transformers import Data2VecConfig, Data2VecForAudioModel +from transformers.models.data2vec.modeling_data2vec import Data2VecForCTC # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py -from .data2vec_audio import Data2VecAudioModel +from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel # noqa: F401 logging.set_verbosity_info() @@ -36,30 +38,21 @@ MAPPING = { "post_extract_proj": "feature_projection.projection", "models.0.layer_norm": "feature_projection.layer_norm", - "encoder.pos_conv.0": "pos_conv_embed.conv", - "self_attn.k_proj": "encoder.layer.*.attention.self.key", - "self_attn.v_proj": "encoder.layer.*.attention.self.value", - "self_attn.q_proj": "encoder.layer.*.attention.self.query", - "self_attn.out_proj.weight": "encoder.layer.*.attention.output.dense", - "self_attn_layer_norm": "encoder.layer.*.attention.output.LayerNorm", - "fc1": "encoder.layer.*.intermediate.dense", - "fc2": "encoder.layer.*.output.dense", - "final_layer_norm": "encoder.layer.*.output.LayerNorm", - "encoder.layer_norm": "layer_norm", + "self_attn.k_proj": "encoder.layers.*.attention.k_proj", + "self_attn.v_proj": "encoder.layers.*.attention.v_proj", + "self_attn.q_proj": "encoder.layers.*.attention.q_proj", + "self_attn.out_proj": "encoder.layers.*.attention.out_proj", + "self_attn_layer_norm": "encoder.layers.*.layer_norm", + "fc1": "encoder.layers.*.feed_forward.intermediate_dense", + "fc2": "encoder.layers.*.feed_forward.output_dense", + "final_layer_norm": "encoder.layers.*.final_layer_norm", + "encoder.layer_norm": "encoder.layer_norm", "w2v_model.layer_norm": "feature_projection.layer_norm", - "quantizer.weight_proj": "quantizer.weight_proj", - "quantizer.vars": "quantizer.codevectors", - "project_q": "project_q", - # "final_proj": "project_hid", "w2v_encoder.proj": "lm_head", "mask_emb": "masked_spec_embed", } TOP_LEVEL_KEYS = [ "lm_head", - "quantizer.weight_proj", - "quantizer.codevectors", - "project_q", - "project_hid", ] @@ -95,7 +88,13 @@ def recursively_load_weights(fairseq_model, hf_model, is_headless): unused_weights = [] fairseq_dict = fairseq_model.state_dict() - feature_extractor = hf_model.feature_extractor + if not is_headless: + feature_extractor = hf_model.data2vec.feature_extractor + pos_conv_embedding = hf_model.data2vec.encoder.pos_conv_embed + + else: + feature_extractor = hf_model.feature_extractor + pos_conv_embedding = hf_model.encoder.pos_conv_embed for name, value in fairseq_dict.items(): is_used = False @@ -105,12 +104,20 @@ def 
recursively_load_weights(fairseq_model, hf_model, is_headless): value, feature_extractor, unused_weights, - hf_model.config.feat_extract_norm == "group", + ) + is_used = True + elif "pos_conv" in name: + load_pos_conv_layer( + name, + value, + pos_conv_embedding, + unused_weights, ) is_used = True else: for key, mapped_key in MAPPING.items(): - # mapped_key = "wav2vec2." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key + if not is_headless: + mapped_key = "data2vec." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]: is_used = True if "*" in mapped_key: @@ -135,44 +142,55 @@ def recursively_load_weights(fairseq_model, hf_model, is_headless): logger.warning(f"Unused weights: {unused_weights}") -def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): +def access_by_string(module, path): + names = path.split(".") + return reduce(getattr, names, module) + + +def set_weights(full_name, module, fsq_value, hf_weight_path): + hf_weight = access_by_string(module, hf_weight_path) + hf_value = hf_weight.data + + if fsq_value.shape != hf_value.shape: + raise ValueError( + f"{full_name} has size {fsq_value.shape}, but {hf_value.shape} was found." + ) + hf_weight.data = fsq_value + logger.info(f"{full_name} was correctly initialized from {hf_weight_path}.") + + +def load_conv_layer(full_name, value, feature_extractor, unused_weights): name = full_name.split("conv_layers.")[-1] items = name.split(".") layer_id = int(items[0]) type_id = int(items[1]) + weight_type = name.split(".")[-1] if type_id == 0: - if "bias" in name: - if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape: - raise ValueError( - f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.bias.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape: - raise ValueError( - f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.weight.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): - if "bias" in name: - if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape: - raise ValueError( - f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape: - raise ValueError( - f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found." 
- ) - feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") + layer_type = "conv" + elif type_id == 2: + layer_type = "layer_norm" else: unused_weights.append(full_name) + return + + set_weights(full_name, feature_extractor, value, f"conv_layers.{layer_id}.{layer_type}.{weight_type}") + + +def load_pos_conv_layer(full_name, value, pos_conv_embeddings, unused_weights): + name = full_name.split("pos_conv.")[-1] + items = name.split(".") + layer_id = int(items[0]) + type_id = int(items[1]) + + weight_type = name.split(".")[-1] + if type_id != 0: + unused_weights.append(full_name) + return + else: + layer_type = "conv" + + set_weights(full_name, pos_conv_embeddings, value, f"layers.{layer_id}.{layer_type}.{weight_type}") @torch.no_grad() @@ -185,40 +203,59 @@ def convert_wav2vec2_checkpoint( if config_path is not None: config = Data2VecConfig.from_pretrained(config_path) else: - config = Data2VecConfig() + config = Data2VecConfig(layer_norm_eps=1e-5, vocab_size=32) + + if not is_finetuned: + # Modify final_proj layer name + hf_wav2vec = Data2VecForAudioModel(config) + data2vec_checkpoint_dir = os.path.dirname(checkpoint_path) + + state_dict = torch.load(checkpoint_path) + state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight") + state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias") + converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt") + torch.save(state_dict, converted_ckpt) + else: + hf_wav2vec = Data2VecForCTC(config) + converted_ckpt = checkpoint_path - # hf_wav2vec = Wav2Vec2ForPreTraining(config) - hf_wav2vec = Data2VecForAudioModel(config) + def load_data2vec(path): + model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([path]) + return model[0].eval() - # Modify final_proj layer name - data2vec_checkpoint_dir = os.path.dirname(checkpoint_path) - state_dict = torch.load(checkpoint_path) - state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight") - state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias") - converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt") - torch.save(state_dict, converted_ckpt) - model = Data2VecAudioModel.from_pretrained(data2vec_checkpoint_dir, checkpoint_file=converted_ckpt) - model = model.eval() + model = load_data2vec(converted_ckpt) recursively_load_weights(model, hf_wav2vec, not is_finetuned) - def map_to_array(batch): - speech, _ = sf.read(batch["file"]) - batch["speech"] = speech - return batch + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60") + + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") + input_audio = [x["array"] for x in ds[:4]["audio"]] + + inputs = processor(input_audio, return_tensors="pt", padding=True) + + input_values = inputs.input_values + attention_mask = inputs.attention_mask +# input_values = inputs.input_values[:, :-1] +# attention_mask = inputs.attention_mask[:, :-1] - processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - ds = ds.map(map_to_array) - input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 hf_wav2vec.eval() - our_output = hf_wav2vec(input_values)["last_hidden_state"] - # our_hidden_states = 
hf_wav2vec(input_values, output_hidden_states=-True).hidden_states model.eval() - their_output = model.models[0](input_values, mask=False, features_only=True)["layer_results"][-1][0].transpose( - 0, 1 - ) - # their_hidden_states = model.models[0](input_values, mask=False, features_only=True)["layer_results"] + if is_finetuned: + their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)["encoder_out"].transpose( + 0, 1 + ) + our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["logits"] + + pred_ids = torch.argmax(our_output, dim=-1) + output_string = processor.batch_decode(pred_ids) + + print(f"Expected Output: {ds[:4]['text']}, Pred: {output_string}") + else: + their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)["layer_results"][-1][0].transpose( + 0, 1 + ) + our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["last_hidden_state"] print(our_output.shape, their_output.shape) max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() @@ -230,6 +267,11 @@ def map_to_array(batch): hf_wav2vec.save_pretrained(pytorch_dump_folder_path) + if is_finetuned: + processor.save_pretrained(pytorch_dump_folder_path) + else: + processor.feature_extractor.save_pretrained(pytorch_dump_folder_path) + if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/src/transformers/models/data2vec/data2vec_audio.py b/src/transformers/models/data2vec/data2vec_audio.py index c9cee07c2671a..a5d91b82186f1 100644 --- a/src/transformers/models/data2vec/data2vec_audio.py +++ b/src/transformers/models/data2vec/data2vec_audio.py @@ -334,7 +334,6 @@ def forward( else: x = features mask_indices = None - print(f"Before encoder FB: {x.shape} {x}") x, layer_results = self.encoder( x, padding_mask=padding_mask, diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec.py index 2e53b915a65a9..a3420d84b195a 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec.py @@ -16,12 +16,14 @@ import math +from typing import Union, Optional import torch import torch.utils.checkpoint from packaging import version from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +import numpy as np from ...activations import ACT2FN, gelu from ...file_utils import ( add_code_sample_docstrings, @@ -29,6 +31,7 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) +from ...deepspeed import is_deepspeed_zero3_enabled from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, @@ -38,6 +41,7 @@ QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, + CausalLMOutput, ) from ...modeling_utils import ( PreTrainedModel, @@ -45,12 +49,13 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) +from ...modeling_utils import torch_int_div +from ...modeling_outputs import BaseModelOutput from ...models.wav2vec2.modeling_wav2vec2 import ( Wav2Vec2Adapter, Wav2Vec2BaseModelOutput, Wav2Vec2FeatureEncoder, - Wav2Vec2FeatureProjection, - Wav2Vec2PositionalConvEmbedding, + Wav2Vec2EncoderLayer, ) from ...utils import logging from .configuration_data2vec import Data2VecConfig @@ -58,10 +63,21 @@ logger = logging.get_logger(__name__) + +_HIDDEN_STATES_START_POSITION = 2 + _CHECKPOINT_FOR_DOC = "data2vec" _CONFIG_FOR_DOC = "Data2VecConfig" 
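The `_CTC_EXPECTED_OUTPUT` and `_CTC_EXPECTED_LOSS` constants introduced just below feed the auto-generated code samples for `Data2VecForCTC`, which is added further down in this file. For reference, the CTC objective that head computes reduces to the minimal sketch below; every tensor, shape and token id here is a made-up placeholder, and in the model itself `input_lengths` are first derived from the sample-level attention mask via `_get_feat_extract_output_lengths`, with `blank`, `reduction` and `zero_infinity` taken from the config (`pad_token_id`, `ctc_loss_reduction`, `ctc_zero_infinity`).

```python
import torch
import torch.nn as nn

# placeholder logits: batch of 2 utterances, 50 output frames, vocabulary of 32 (id 0 = pad/blank)
logits = torch.randn(2, 50, 32)
input_lengths = torch.tensor([50, 42])  # valid output frames per utterance
labels = torch.tensor([[7, 4, 11, 11, 14], [22, 14, -100, -100, -100]])  # -100 pads the labels

labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)  # true label length per utterance
flattened_targets = labels.masked_select(labels_mask)  # 1-D concatenation of the real labels

# ctc_loss expects fp32 log-probabilities shaped (time, batch, vocab)
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
loss = nn.functional.ctc_loss(
    log_probs,
    flattened_targets,
    input_lengths,
    target_lengths,
    blank=0,
    reduction="sum",
    zero_infinity=False,
)
```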
_TOKENIZER_FOR_DOC = "RobertaTokenizer" +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2Config" +_PROCESSOR_FOR_DOC = "Wav2Vec2Processor" + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" +_CTC_EXPECTED_LOSS = 53.48 + DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = [ "data2vec", # See all data2vec models at https://huggingface.co/models?filter=data2vec @@ -582,7 +598,6 @@ def forward(self, hidden_states): return pooled_output -# Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->Data2Vec,roberta->data2vec class Data2VecPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -593,7 +608,6 @@ class Data2VecPreTrainedModel(PreTrainedModel): base_model_prefix = "data2vec" supports_gradient_checkpointing = True - # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): """Initialize the weights""" if isinstance(module, nn.Linear): @@ -607,8 +621,10 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, Data2VecEncoder): @@ -690,6 +706,157 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): """ +class Data2VecPadLayer(nn.Module): + def __init__(self, conv_pos_kernel_size): + super().__init__() + self.num_pad_remove = 1 if conv_pos_kernel_size % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Data2VecPositionalConvLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.conv_pos_kernel_size, + padding=config.conv_pos_kernel_size // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + self.padding = Data2VecPadLayer(config.conv_pos_kernel_size) + self.activation = ACT2FN[config.feat_extract_activation] + # no learnable parameters + self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Data2VecPositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.layers = nn.ModuleList([Data2VecPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]) + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + for layer in self.layers: + hidden_states = layer(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Data2VecFeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], 
eps=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +class Data2VecAudioEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Data2VecPositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + hidden_states[~attention_mask] = 0.0 + + # extend attention_mask + attention_mask = (1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)) * -10000.0 + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + @add_start_docstrings( "The bare Data2Vec Model for audio transformer outputting raw hidden-states without any specific head on top.", DATA2VEC_START_DOCSTRING, @@ -699,21 +866,63 @@ def __init__(self, config): super().__init__(config) self.config = config self.feature_extractor = 
Wav2Vec2FeatureEncoder(config) - self.feature_projection = Wav2Vec2FeatureProjection(config) - self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) - self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feature_projection = Data2VecFeatureProjection(config) self.dropout = nn.Dropout(config.hidden_dropout) # model only needs masking vector if mask prob is > 0.0 if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) + self.encoder = Data2VecAudioEncoder(config) + self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None - self.encoder = Data2VecEncoder(config) # Initialize weights and apply final processing self.post_init() + def _get_feat_extract_output_lengths( + self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch_int_div(input_length - kernel_size, stride) + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + def _get_feature_vector_attention_mask( + self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None + ): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. 
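+ # (the convolutional feature extractor downsamples the raw waveform, so the per-sample lengths are
+ # mapped through `_get_feat_extract_output_lengths` and then re-expanded into a frame-level boolean mask)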
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] + + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) + output_lengths = output_lengths.to(torch.long) + + batch_size = attention_mask.shape[0] + + attention_mask = torch.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + return attention_mask + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, @@ -737,13 +946,14 @@ def forward( extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) - hidden_states, extract_features = self.feature_projection(extract_features) - print(f"Before positional encoding HF:\n {hidden_states.shape} {hidden_states}") - position_embeddings = self.pos_conv_embed(hidden_states) - hidden_states = hidden_states + position_embeddings - hidden_states = self.layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states) + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) encoder_outputs = self.encoder( hidden_states, @@ -766,6 +976,118 @@ def forward( ) +@add_start_docstrings( + """Data2Vec Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", + DATA2VEC_START_DOCSTRING, +) +class Data2VecForCTC(Data2VecPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.data2vec = Data2VecForAudioModel(config) + self.dropout = nn.Dropout(config.final_dropout) + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." + ) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_PROCESSOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_CTC_EXPECTED_OUTPUT, + expected_loss=_CTC_EXPECTED_LOSS, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. 
Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.data2vec( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + + if labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + flattened_targets = labels.masked_select(labels_mask) + + # ctc_loss doesn't support fp16 + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + + with torch.backends.cudnn.flags(enabled=False): + loss = nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + @add_start_docstrings( "The bare Data2Vec Model for text transformer outputting raw hidden-states without any specific head on top.", DATA2VEC_START_DOCSTRING, From f88162c0278ddfaa42e645a660962c15df84fcc6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 18 Feb 2022 18:13:58 +0000 Subject: [PATCH 091/121] finish audio file --- src/transformers/models/data2vec/__init__.py | 62 +- .../data2vec/configuration_data2vec_audio.py | 320 +++ ...2vec.py => configuration_data2vec_text.py} | 66 +- ..._original_pytorch_checkpoint_to_pytorch.py | 21 +- .../data2vec/modeling_data2vec_audio.py | 1831 +++++++++++++++++ ..._data2vec.py => modeling_data2vec_text.py} | 555 +---- 6 files changed, 2293 insertions(+), 562 deletions(-) create mode 100644 src/transformers/models/data2vec/configuration_data2vec_audio.py rename src/transformers/models/data2vec/{configuration_data2vec.py => configuration_data2vec_text.py} (72%) create mode 100755 src/transformers/models/data2vec/modeling_data2vec_audio.py rename src/transformers/models/data2vec/{modeling_data2vec.py => modeling_data2vec_text.py} (75%) diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 00873fff121d3..1abf2a9f9efac 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -22,38 +22,52 @@ _import_structure = { - "configuration_data2vec": ["DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecConfig", "Data2VecOnnxConfig"], + 
"configuration_data2vec_text": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecTextConfig", "Data2VecTextOnnxConfig"], + "configuration_data2vec_audio": ["DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig", "Data2VecAudioOnnxConfig"], } if is_torch_available(): - _import_structure["modeling_data2vec"] = [ - "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", - "Data2VecForAudioModel", - "Data2VecForCausalLM", - "Data2VecForMaskedLM", - "Data2VecForMultipleChoice", - "Data2VecForQuestionAnswering", - "Data2VecForSequenceClassification", - "Data2VecForTextModel", - "Data2VecForTokenClassification", - "Data2VecPreTrainedModel", + _import_structure["modeling_data2vec_text"] = [ + "DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecTextModel", + "Data2VecTextForCausalLM", + "Data2VecTextForMaskedLM", + "Data2VecTextForMultipleChoice", + "Data2VecTextForQuestionAnswering", + "Data2VecTextForSequenceClassification", + "Data2VecTextForTextModel", + "Data2VecTextForTokenClassification", + "Data2VecTextPreTrainedModel", + ] + _import_structure["modeling_data2vec_audio"] = [ + "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecAudioModel", + "Data2VecAudioPreTrainedModel", + "Data2VecAudioForCTC", ] if TYPE_CHECKING: - from .configuration_data2vec import DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecConfig, Data2VecOnnxConfig + from .configuration_data2vec_text import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecTextConfig, Data2VecTextOnnxConfig + from .configuration_data2vec_audio import DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig, Data2VecAudioOnnxConfig if is_torch_available(): - from .modeling_data2vec import ( - DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecForAudioModel, - Data2VecForCausalLM, - Data2VecForMaskedLM, - Data2VecForMultipleChoice, - Data2VecForQuestionAnswering, - Data2VecForSequenceClassification, - Data2VecForTextModel, - Data2VecForTokenClassification, - Data2VecPreTrainedModel, + from .modeling_data2vec_text import ( + DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecTextForAudioModel, + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTextModel, + Data2VecTextForTokenClassification, + Data2VecTextPreTrainedModel, + ) + from .modeling_data2vec_audio import ( + DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecAudioModel, + Data2VecAudioPreTrainedModel, + Data2VecAudioForCTC, ) else: diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py new file mode 100644 index 0000000000000..b2d1856a63b5e --- /dev/null +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -0,0 +1,320 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Data2VecAudio model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +DATA_2_VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/data2vec-base-960h": "https://huggingface.co/facebook/data2vec-audio-base-960h/resolve/main/config.json", + # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio +} + + +class Data2VecAudioConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate an + Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Data2VecAudio + [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32): + Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`]. Vocabulary size of the + model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward + method of [`Data2VecAudioModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + final_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the final projection layer of [`Data2VecAudioForCTC`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + feat_extract_norm (`str`, *optional*, defaults to `"group"`): + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group + normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D + convolutional layers. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for output of the feature encoder. + feat_extract_activation (`str, `optional`, defaults to `"gelu"`): + The non-linear activation function (function or string) in the 1D convolutional layers of the feature + extractor. 
If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout probability for quantized feature encoder states.
+ conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+ A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+ feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+ conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+ A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+ of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+ conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+ A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+ length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+ *conv_dim*.
+ conv_bias (`bool`, *optional*, defaults to `False`):
+ Whether the 1D convolutional layers have a bias.
+ num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
+ Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+ embeddings layer.
+ num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
+ Number of groups of 1D convolutional positional embeddings layer.
+ mask_time_prob (`float`, *optional*, defaults to 0.05):
+ Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+ procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+ reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+ masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+ actual percentage of masked vectors.
+ mask_time_length (`int`, *optional*, defaults to 10):
+ Length of vector span along the time axis.
+ mask_time_min_masks (`int`, *optional*, defaults to 2):
+ The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+ irrespectively of `mask_time_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
+ mask_time_min_masks''
+ mask_feature_prob (`float`, *optional*, defaults to 0.0):
+ Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+ masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
+ the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
+ span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
+ may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
+ True`.
+ mask_feature_length (`int`, *optional*, defaults to 10):
+ Length of vector span along the feature axis.
+ mask_feature_min_masks (`int`, *optional*, defaults to 0):
+ The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+ step, irrespectively of `mask_feature_prob`.
Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' + num_codevectors_per_group (`int`, *optional*, defaults to 320): + Number of entries in each quantization codebook (group). + num_codevector_groups (`int`, *optional*, defaults to 2): + Number of codevector groups for product codevector quantization. + contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): + The temperature *kappa* in the contrastive loss. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. + num_negatives (`int`, *optional*, defaults to 100): + Number of negative samples for the contrastive loss. + codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the quantized feature vectors. + proj_codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the final projection of both the quantized and the transformer features. + diversity_loss_weight (`int`, *optional*, defaults to 0.1): + The weight of the codebook diversity loss component. + ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`Data2VecAudioForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`Data2VecAudioForCTC`]. + use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): + Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an + instance of [`Data2VecAudioForSequenceClassification`]. + classifier_proj_size (`int`, *optional*, defaults to 256): + Dimensionality of the projection before token mean-pooling for classification. + tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): + A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* + module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. + tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the + *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. + tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): + A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the + *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. + xvector_output_dim (`int`, *optional*, defaults to 512): + Dimensionality of the *XVector* embedding vectors. + add_adapter (`bool`, *optional*, defaults to `False`): + Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful for + warm-starting Data2VecAudio for SpeechEncoderDecoder models. + adapter_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adapter_stride (`int`, *optional*, defaults to 2): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. 
+ num_adapter_layers (`int`, *optional*, defaults to 3): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + output_hidden_size (`int`, *optional*): + Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant + if `add_adapter is True`. + + Example: + + ```python + >>> from transformers import Data2VecAudioModel, Data2VecAudioConfig + + >>> # Initializing a Data2VecAudio facebook/wav2vec2-base-960h style configuration + >>> configuration = Data2VecAudioConfig() + + >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration + >>> model = Data2VecAudioModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "data2vec-audio" + + def __init__( + self, + vocab_size=32, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout=0.1, + activation_dropout=0.1, + attention_dropout=0.1, + feat_proj_dropout=0.0, + feat_quantizer_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + feat_extract_norm="group", + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + num_conv_pos_embedding_groups=16, + conv_pos_kernel_size=19, + num_conv_pos_embeddings=5, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + num_codevectors_per_group=320, + num_codevector_groups=2, + contrastive_logits_temperature=0.1, + num_negatives=100, + codevector_dim=256, + proj_codevector_dim=256, + diversity_loss_weight=0.1, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + classifier_proj_size=256, + tdnn_dim=(512, 512, 512, 512, 1500), + tdnn_kernel=(5, 3, 3, 1, 1), + tdnn_dilation=(1, 2, 3, 1, 1), + xvector_output_dim=512, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=3, + output_hidden_size=None, + **kwargs + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.conv_pos_kernel_size = conv_pos_kernel_size + self.num_feat_extract_layers = len(self.conv_dim) + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.use_weighted_layer_sum = use_weighted_layer_sum + + if ( + (len(self.conv_stride) != self.num_feat_extract_layers) + 
or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. " + "It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, " + f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) " + f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # parameters for pretraining with codevector quantized representations + self.num_codevectors_per_group = num_codevectors_per_group + self.num_codevector_groups = num_codevector_groups + self.contrastive_logits_temperature = contrastive_logits_temperature + self.feat_quantizer_dropout = feat_quantizer_dropout + self.num_negatives = num_negatives + self.codevector_dim = codevector_dim + self.proj_codevector_dim = proj_codevector_dim + self.diversity_loss_weight = diversity_loss_weight + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.output_hidden_size = output_hidden_size or hidden_size + + # SequenceClassification-specific parameter. Feel free to ignore for other classes. + self.classifier_proj_size = classifier_proj_size + + # XVector-specific parameters. Feel free to ignore for other classes. + self.tdnn_dim = list(tdnn_dim) + self.tdnn_kernel = list(tdnn_kernel) + self.tdnn_dilation = list(tdnn_dilation) + self.xvector_output_dim = xvector_output_dim diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec_text.py similarity index 72% rename from src/transformers/models/data2vec/configuration_data2vec.py rename to src/transformers/models/data2vec/configuration_data2vec_text.py index 2d423b61d6fe1..a6843c650cf1c 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Data2Vec configuration""" +""" Data2VecText configuration""" from collections import OrderedDict from typing import Mapping @@ -28,12 +28,12 @@ } -class Data2VecConfig(PretrainedConfig): +class Data2VecTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecForTextModel`] and - [`Data2VecForAudioModel`]. It is used to instantiate a Data2Vec model according to the specified arguments, + This is the configuration class to store the configuration of a [`Data2VecTextForTextModel`] and + [`Data2VecTextForAudioModel`]. It is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration - to that of the Data2Vec [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. + to that of the Data2VecText [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -82,18 +82,18 @@ class Data2VecConfig(PretrainedConfig): Examples: ```python - >>> from transformers import Data2VecForTextModel, Data2VecConfig + >>> from transformers import Data2VecTextModel, Data2VecTextConfig - >>> # Initializing a Data2Vec edugp/data2vec-nlp-base style configuration - >>> configuration = Data2VecConfig() + >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration + >>> configuration = Data2VecTextConfig() >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration - >>> model = Data2VecForTextModel(configuration) + >>> model = Data2VecTextModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "data2vec" + model_type = "data2vec_text" def __init__( self, @@ -115,24 +115,6 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - # Here starts audio-only parameters - layer_norm_eps_audio=1e-5, - feat_extract_norm="layer", - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - feat_proj_dropout=0.0, - mask_time_prob=0.05, - do_stable_layer_norm=False, - num_conv_pos_embedding_groups=16, - conv_pos_kernel_size=19, - num_conv_pos_embeddings=5, - activation_dropout=0.1, - add_adapter=False, - final_dropout=0.1, - proj_codevector_dim=256, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -152,31 +134,9 @@ def __init__( self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout - # Here starts audio-only parameters - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_feat_extract_layers = len(self.conv_dim) - self.feat_proj_dropout = feat_proj_dropout - self.mask_time_prob = mask_time_prob - self.do_stable_layer_norm = do_stable_layer_norm - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.conv_pos_kernel_size = conv_pos_kernel_size - self.num_conv_pos_embeddings = num_conv_pos_embeddings - # Using the same dropout as for text - self.hidden_dropout = self.hidden_dropout_prob - # Using the same dropout as for text - self.attention_dropout = self.attention_probs_dropout_prob - self.activation_dropout = activation_dropout - self.add_adapter = add_adapter - self.final_dropout = final_dropout - self.proj_codevector_dim = proj_codevector_dim - - -class Data2VecOnnxConfig(OnnxConfig): + + +class Data2VecTextOnnxConfig(OnnxConfig): @property def inputs(self) -> Mapping[str, Mapping[int, str]]: return OrderedDict( diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py 
b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py index f039e8f89c708..9445afe54049c 100644 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -25,12 +25,11 @@ import fairseq from datasets import load_dataset -from transformers import Data2VecConfig, Data2VecForAudioModel -from transformers.models.data2vec.modeling_data2vec import Data2VecForCTC - # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py -from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel # noqa: F401 +from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy # noqa: F401 +from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioModel, Data2VecAudioForCTC +from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -89,8 +88,8 @@ def recursively_load_weights(fairseq_model, hf_model, is_headless): fairseq_dict = fairseq_model.state_dict() if not is_headless: - feature_extractor = hf_model.data2vec.feature_extractor - pos_conv_embedding = hf_model.data2vec.encoder.pos_conv_embed + feature_extractor = hf_model.data2vec_audio.feature_extractor + pos_conv_embedding = hf_model.data2vec_audio.encoder.pos_conv_embed else: feature_extractor = hf_model.feature_extractor @@ -117,7 +116,7 @@ def recursively_load_weights(fairseq_model, hf_model, is_headless): else: for key, mapped_key in MAPPING.items(): if not is_headless: - mapped_key = "data2vec." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key + mapped_key = "data2vec_audio." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]: is_used = True if "*" in mapped_key: @@ -201,13 +200,13 @@ def convert_wav2vec2_checkpoint( Copy/paste/tweak model's weights to transformers design. """ if config_path is not None: - config = Data2VecConfig.from_pretrained(config_path) + config = Data2VecAudioConfig.from_pretrained(config_path) else: - config = Data2VecConfig(layer_norm_eps=1e-5, vocab_size=32) + config = Data2VecAudioConfig() if not is_finetuned: # Modify final_proj layer name - hf_wav2vec = Data2VecForAudioModel(config) + hf_wav2vec = Data2VecAudioModel(config) data2vec_checkpoint_dir = os.path.dirname(checkpoint_path) state_dict = torch.load(checkpoint_path) @@ -216,7 +215,7 @@ def convert_wav2vec2_checkpoint( converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt") torch.save(state_dict, converted_ckpt) else: - hf_wav2vec = Data2VecForCTC(config) + hf_wav2vec = Data2VecAudioForCTC(config) converted_ckpt = checkpoint_path def load_data2vec(path): diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py new file mode 100755 index 0000000000000..933458d2b5d4d --- /dev/null +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -0,0 +1,1831 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Data2VecAudio model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...deepspeed import is_deepspeed_zero3_enabled +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel, torch_int_div +from ...utils import logging +from .configuration_data2vec_audio import Data2VecAudioConfig + + +logger = logging.get_logger(__name__) + + +_HIDDEN_STATES_START_POSITION = 2 + +# General docstring +_CONFIG_FOR_DOC = "Data2VecAudioConfig" +_PROCESSOR_FOR_DOC = "Wav2Vec2Processor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/data2vec-audio-base-960h" +_EXPECTED_OUTPUT_SHAPE = [1, 292, 768] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" +_CTC_EXPECTED_LOSS = 53.48 + +# Audio class docstring +_FEAT_EXTRACTOR_FOR_DOC = "Wav2Vec2FeatureExtractor" +_SEQ_CLASS_CHECKPOINT = "superb/data2vec-audio-base-superb-ks" +_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'" +_SEQ_CLASS_EXPECTED_LOSS = 6.54 + +# Frame class docstring +_FRAME_CLASS_CHECKPOINT = "anton-l/data2vec-audio-base-superb-sd" +_FRAME_EXPECTED_OUTPUT = [0, 0] + +# Speaker Verification docstring +_XVECTOR_CHECKPOINT = "anton-l/data2vec-audio-base-superb-sv" +_XVECTOR_EXPECTED_OUTPUT = 0.98 + + +DATA_2_VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/data2vec-audio-base-960h", + "facebook/data2vec-audio-large-960h", + "facebook/data2vec-audio-large-960h-lv60", + "facebook/data2vec-audio-large-960h-lv60-self", + # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio +] + + +@dataclass +class Data2VecAudioBaseModelOutput(ModelOutput): + """ + Output type of [`Data2VecAudioBaseModelOutput`], with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`): + Sequence of extracted feature vectors of the last convolutional layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: torch.FloatTensor = None + extract_features: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class Data2VecAudioForPreTrainingOutput(ModelOutput): + """ + Output type of [`Data2VecAudioForPreTraining`], with potential hidden states and attentions. + + Args: + loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. + projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + """ + + loss: Optional[torch.FloatTensor] = None + projected_states: torch.FloatTensor = None + projected_quantized_states: torch.FloatTensor = None + codevector_perplexity: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + contrastive_loss: Optional[torch.FloatTensor] = None + diversity_loss: Optional[torch.FloatTensor] = None + + +@dataclass +class XVectorOutput(ModelOutput): + """ + Output type of [`Data2VecAudioForXVector`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification loss. 
+ logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`): + Classification hidden states before AMSoftmax. + embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`): + Utterance embeddings used for vector similarity-based retrieval. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + embeddings: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. 
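+
+    Example (an illustrative sketch only, using a small toy shape):
+
+    ```python
+    >>> # mask a batch of 2 sequences of 20 feature frames each, in spans of length 2
+    >>> mask = _compute_mask_indices(shape=(2, 20), mask_prob=0.5, mask_length=2)
+    >>> mask.shape
+    (2, 20)
+    ```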
+ """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.sum(-1).detach().tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
+ if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = np.concatenate( + [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx) + + spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + + # add offset to the starting indexes so that that indexes now create a span + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + batch_size, max_num_masked_span * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + return spec_aug_mask + + +def _sample_negative_indices( + features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None +): + """ + Sample `num_negatives` vectors from feature vectors. + """ + batch_size, sequence_length = features_shape + + # generate indices of the positive vectors themselves, repeat them `num_negatives` times + sequence_length_range = np.arange(sequence_length) + + # get `num_negatives` random vector indices from the same utterance + sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) + + mask_time_indices = ( + mask_time_indices.astype(np.bool) if mask_time_indices is not None else np.ones(features_shape, dtype=np.bool) + ) + + for batch_idx in range(batch_size): + high = mask_time_indices[batch_idx].sum() - 1 + mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]] + + feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives)) + sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives)) + # avoid sampling the same positive vector, but keep the distribution uniform + sampled_indices[sampled_indices >= feature_indices] += 1 + + # remap to actual indices + sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices] + + # correct for batch size + sampled_negative_indices[batch_idx] += batch_idx * sequence_length + + return sampled_negative_indices + + +class Data2VecAudioConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = 
self.conv(hidden_states) + + hidden_states = hidden_states.transpose(-2, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(-2, -1) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Data2VecAudioPadLayer(nn.Module): + def __init__(self, conv_pos_kernel_size): + super().__init__() + self.num_pad_remove = 1 if conv_pos_kernel_size % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Data2VecAudioPositionalConvLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.conv_pos_kernel_size, + padding=config.conv_pos_kernel_size // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + self.padding = Data2VecAudioPadLayer(config.conv_pos_kernel_size) + self.activation = ACT2FN[config.feat_extract_activation] + # no learnable parameters + self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Data2VecAudioPositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.layers = nn.ModuleList([Data2VecAudioPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]) + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + for layer in self.layers: + hidden_states = layer(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Data2VecAudioFeatureEncoder(nn.Module): + """Construct the features from raw audio waveform""" + + def __init__(self, config): + super().__init__() + self.conv_layers = nn.ModuleList([ + Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) + ]) + self.gradient_checkpointing = False + self._requires_grad = True + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def forward(self, input_values): + hidden_states = input_values[:, None] + + # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True + + for conv_layer in self.conv_layers: + if self._requires_grad and self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(conv_layer), + hidden_states, + ) + else: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class Data2VecAudioFeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = 
self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Data2VecAudio +class Data2VecAudioAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class Data2VecAudioFeedForward(nn.Module): + def __init__(self, config): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class Data2VecAudioEncoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = Data2VecAudioAttention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Data2VecAudioFeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Data2VecAudioEncoderLayerStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = Data2VecAudioAttention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Data2VecAudioFeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Data2VecAudioEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = 
Data2VecAudioPositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([Data2VecAudioEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + hidden_states[~attention_mask] = 0.0 + + # extend attention_mask + attention_mask = (1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)) * -10000.0 + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Data2VecAudioAdapter(nn.Module): + def __init__(self, config): + super().__init__() + + # feature dim might need to be down-projected + if config.output_hidden_size != config.hidden_size: + self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) + else: + self.proj = self.proj_layer_norm = None + + self.layers = nn.ModuleList(Data2VecAudioAdapterLayer(config) for _ in range(config.num_adapter_layers)) + self.layerdrop = config.layerdrop + + def forward(self, hidden_states): + # down project hidden_states if necessary + if self.proj is not None and self.proj_layer_norm is not None: + hidden_states = self.proj(hidden_states) + hidden_states = self.proj_layer_norm(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + + for layer in self.layers: + layerdrop_prob = 
np.random.random() + if not self.training or (layerdrop_prob > self.layerdrop): + hidden_states = layer(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Data2VecAudioAdapterLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.output_hidden_size, + 2 * config.output_hidden_size, + config.adapter_kernel_size, + stride=config.adapter_stride, + padding=1, + ) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = nn.functional.glu(hidden_states, dim=1) + + return hidden_states + + +class Data2VecAudioPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Data2VecAudioConfig + base_model_prefix = "data2vec_audio" + main_input_name = "input_values" + _keys_to_ignore_on_load_missing = [r"position_ids"] + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, Data2VecAudioFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + if module.bias is not None: + module.bias.data.zero_() + if module.weight is not None: + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + def _get_feat_extract_output_lengths( + self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch_int_div(input_length - kernel_size, stride) + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + def _get_feature_vector_attention_mask( + self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None + ): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. 
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+        output_lengths = output_lengths.to(torch.long)
+
+        batch_size = attention_mask.shape[0]
+
+        attention_mask = torch.zeros(
+            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        # these two operations make sure that all values before the output lengths idxs are attended to
+        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+        return attention_mask
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (Data2VecAudioEncoder, Data2VecAudioFeatureEncoder)):
+            module.gradient_checkpointing = value
+
+
+DATA_2_VEC_AUDIO_START_DOCSTRING = r"""
+    Data2VecAudio was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and
+    Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and
+    Michael Auli.
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, etc.).
+
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+    it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`Data2VecAudioConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+DATA_2_VEC_AUDIO_INPUTS_DOCSTRING = r"""
+    Args:
+        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
+            into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install
+            soundfile*). To prepare the array into *input_values*, the [`Wav2Vec2Processor`] should be used for padding
+            and conversion into a tensor of type *torch.FloatTensor*. See [`Wav2Vec2Processor.__call__`] for details.
+        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
+            1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            <Tip warning={true}>
+
+            `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
+            True`. For all models whose processor has `config.return_attention_mask == False`, such as
+            [data2vec-audio-base](https://huggingface.co/facebook/data2vec-audio-base-960h), `attention_mask` should **not** be
+            passed to avoid degraded performance when doing batched inference. For such models `input_values` should
+            simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly
+            different results depending on whether `input_values` is padded or not.
+
+            </Tip>
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers.
See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Data2VecAudio Model transformer outputting raw hidden-states without any specific head on top.", + DATA_2_VEC_AUDIO_START_DOCSTRING, +) +class Data2VecAudioModel(Data2VecAudioPreTrainedModel): + def __init__(self, config: Data2VecAudioConfig): + super().__init__(config) + self.config = config + self.feature_extractor = Data2VecAudioFeatureEncoder(config) + self.feature_projection = Data2VecAudioFeatureProjection(config) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) + + self.encoder = Data2VecAudioEncoder(config) + + self.adapter = Data2VecAudioAdapter(config) if config.add_adapter else None + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.feature_extractor._freeze_parameters() + + def _mask_hidden_states( + self, + hidden_states: torch.FloatTensor, + mask_time_indices: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://arxiv.org/abs/1904.08779). 
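+
+        Time-axis masking replaces the selected positions with the learned `masked_spec_embed` vector, while
+        feature-axis masking zeroes out the selected feature dimensions. A minimal sketch of passing an explicit time
+        mask (assuming `model` is a [`Data2VecAudioModel`] and `hidden_states` has shape `(batch_size,
+        sequence_length, hidden_size)`):
+
+        ```python
+        >>> mask = torch.zeros(hidden_states.shape[:2], dtype=torch.bool, device=hidden_states.device)
+        >>> mask[:, :10] = True  # mask the first 10 time steps of every sequence
+        >>> hidden_states = model._mask_hidden_states(hidden_states, mask_time_indices=mask)
+        ```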
+ """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.size() + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_PROCESSOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Data2VecAudioBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + input_values, + attention_mask=None, + mask_time_indices=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Data2VecAudioBaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + 
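For quick reference, here is a minimal inference sketch for the bare `Data2VecAudioModel` defined above. It assumes the `facebook/data2vec-audio-base-960h` checkpoint listed in `DATA_2_VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST` is available on the Hub and mirrors the dataset handling of the pre-training example below:

```python
import torch
import soundfile as sf
from datasets import load_dataset
from transformers import Wav2Vec2FeatureExtractor
from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioModel

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/data2vec-audio-base-960h")
model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base-960h")


def map_to_array(batch):
    # decode the raw waveform from disk into a float array
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch


ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)

# pad and convert the waveform into the `input_values` tensor expected by the model
input_values = feature_extractor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt").input_values

# the bare model returns frame-level hidden states of shape (batch_size, num_frames, hidden_size)
with torch.no_grad():
    outputs = model(input_values)
print(outputs.last_hidden_state.shape)
```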
+@add_start_docstrings("""Data2VecAudio Model with a quantizer and `VQ` head on top.""", DATA_2_VEC_AUDIO_START_DOCSTRING)
+class Data2VecAudioForPreTraining(Data2VecAudioPreTrainedModel):
+    def __init__(self, config: Data2VecAudioConfig):
+        super().__init__(config)
+        self.data2vec_audio = Data2VecAudioModel(config)
+        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # make sure that project_hid & project_q are initialized like normal linear layers
+        self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim)
+        self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
+
+    def set_gumbel_temperature(self, temperature: int):
+        """
+        Set the Gumbel softmax temperature to a given value. Only necessary for training.
+        """
+        self.quantizer.temperature = temperature
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
+        self.data2vec_audio.feature_extractor._freeze_parameters()
+
+    @staticmethod
+    def compute_contrastive_logits(
+        target_features: torch.FloatTensor,
+        negative_features: torch.FloatTensor,
+        predicted_features: torch.FloatTensor,
+        temperature: float = 0.1,
+    ):
+        """
+        Compute logits for contrastive loss using cosine similarity as the distance measure between
+        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
+        """
+        target_features = torch.cat([target_features, negative_features], dim=0)
+
+        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(
+            target_features
+        )
+
+        # apply temperature
+        logits = logits / temperature
+        return logits
+
+    @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Data2VecAudioForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_values,
+        attention_mask=None,
+        mask_time_indices=None,
+        sampled_negative_indices=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*):
+            Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss.
+            Required input for pre-training.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import Wav2Vec2FeatureExtractor, Data2VecAudioForPreTraining
+        >>> from transformers.models.data2vec.modeling_data2vec_audio import _compute_mask_indices
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
+
+        >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("patrickvonplaten/data2vec-audio-base")
+        >>> model = Data2VecAudioForPreTraining.from_pretrained("patrickvonplaten/data2vec-audio-base")
+
+
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...
return batch + + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = ds.map(map_to_array) + + >>> input_values = feature_extractor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length) + >>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2) + >>> mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long) + + >>> with torch.no_grad(): + ... outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + >>> # show that cosine similarity is much higher than random + >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5 + tensor(True) + + >>> # for contrastive loss training model should be put into train mode + >>> model = model.train() + >>> loss = model(input_values, mask_time_indices=mask_time_indices).loss + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if mask_time_indices is not None: + mask_time_indices = mask_time_indices.to(torch.bool) + + outputs = self.data2vec_audio( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + mask_time_indices=mask_time_indices, + return_dict=return_dict, + ) + + # 1. project all transformed features (including masked) to final vq dim + transformer_features = self.project_hid(outputs[0]) + + # 2. quantize all (unmasked) extracted features and project to final vq dim + extract_features = self.dropout_features(outputs[1]) + + if attention_mask is not None: + # compute reduced attention_mask correponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + quantized_features, codevector_perplexity = self.quantizer( + extract_features, mask_time_indices=mask_time_indices + ) + quantized_features = self.project_q(quantized_features) + + loss = contrastive_loss = diversity_loss = None + if sampled_negative_indices is not None: + batch_size, sequence_length, hidden_size = quantized_features.shape + + # for training, we sample negatives + # 3. sample K negatives (distractors) quantized states for contrastive loss + # if attention_mask is passed, make sure that padded feature vectors cannot be sampled + # sample negative quantized vectors BTC => (BxT)C + negative_quantized_features = quantized_features.view(-1, hidden_size)[ + sampled_negative_indices.long().view(-1) + ] + negative_quantized_features = negative_quantized_features.view( + batch_size, sequence_length, -1, hidden_size + ).permute(2, 0, 1, 3) + + # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa` + # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf + logits = self.compute_contrastive_logits( + quantized_features[None, :], + negative_quantized_features, + transformer_features, + self.config.contrastive_logits_temperature, + ) + + # 5. if a negative vector is identical to the positive (i.e. 
when codebook utilization is low), + # its cosine similarity will be masked + neg_is_pos = (quantized_features == negative_quantized_features).all(-1) + + if neg_is_pos.any(): + logits[1:][neg_is_pos] = float("-inf") + + # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = + # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) + logits = logits.transpose(0, 2).reshape(-1, logits.size(0)) + target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten() + + contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") + # 7. compute diversity loss: \mathbf{L}_d + num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups + diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() + + # 8. \mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d + loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss + + if not return_dict: + if loss is not None: + return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + + return Data2VecAudioForPreTrainingOutput( + loss=loss, + projected_states=transformer_features, + projected_quantized_states=quantized_features, + codevector_perplexity=codevector_perplexity, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + contrastive_loss=contrastive_loss, + diversity_loss=diversity_loss, + ) + + +@add_start_docstrings( + """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", + DATA_2_VEC_AUDIO_START_DOCSTRING, +) +class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.data2vec_audio = Data2VecAudioModel(config) + self.dropout = nn.Dropout(config.final_dropout) + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." + ) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.data2vec_audio.feature_extractor._freeze_parameters() + + @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_PROCESSOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_CTC_EXPECTED_OUTPUT, + expected_loss=_CTC_EXPECTED_LOSS, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. 
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.data2vec_audio( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + + if labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + flattened_targets = labels.masked_select(labels_mask) + + # ctc_loss doesn't support fp16 + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + + with torch.backends.cudnn.flags(enabled=False): + loss = nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@add_start_docstrings( + """ + Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like + SUPERB Keyword Spotting. + """, + DATA_2_VEC_AUDIO_START_DOCSTRING, +) +class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.data2vec_audio = Data2VecAudioModel(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) + self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.data2vec_audio.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. 
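+
+        A minimal sketch of how this is typically combined with an optimizer so that only the classification head is
+        trained (the checkpoint name below is a placeholder, not a released checkpoint):
+
+        ```python
+        >>> import torch
+        >>> model = Data2VecAudioForSequenceClassification.from_pretrained("<path-or-repo-id>", num_labels=2)
+        >>> model.freeze_base_model()
+        >>> # only the projector and classifier parameters remain trainable
+        >>> trainable_params = [p for p in model.parameters() if p.requires_grad]
+        >>> optimizer = torch.optim.AdamW(trainable_params, lr=1e-4)
+        ```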
+ """ + for param in self.data2vec_audio.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_SEQ_CLASS_CHECKPOINT, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.data2vec_audio( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + if attention_mask is None: + pooled_output = hidden_states.mean(dim=1) + else: + padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) + hidden_states[~padding_mask] = 0.0 + pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization. + """, + DATA_2_VEC_AUDIO_START_DOCSTRING, +) +class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.data2vec_audio = Data2VecAudioModel(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
+ """ + self.data2vec_audio.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.data2vec_audio.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_FRAME_CLASS_CHECKPOINT, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_FRAME_EXPECTED_OUTPUT, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.data2vec_audio( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + logits = self.classifier(hidden_states) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return output + + return TokenClassifierOutput( + loss=None, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AMSoftmaxLoss(nn.Module): + def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): + super(AMSoftmaxLoss, self).__init__() + self.scale = scale + self.margin = margin + self.num_labels = num_labels + self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() + + def forward(self, hidden_states, labels): + labels = labels.flatten() + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) + cos_theta = torch.mm(hidden_states, weight) + psi = cos_theta - self.margin + + onehot = nn.functional.one_hot(labels, self.num_labels) + logits = self.scale * torch.where(onehot.bool(), psi, cos_theta) + loss = self.loss(logits, labels) + + return loss + + +class TDNNLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id] + self.out_conv_dim = config.tdnn_dim[layer_id] + self.kernel_size = config.tdnn_kernel[layer_id] + self.dilation = config.tdnn_dilation[layer_id] + + self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim) + self.activation = nn.ReLU() + + def forward(self, hidden_states): + hidden_states = hidden_states.unsqueeze(1) + 
hidden_states = nn.functional.unfold( + hidden_states, + (self.kernel_size, self.in_conv_dim), + stride=(1, self.in_conv_dim), + dilation=(self.dilation, 1), + ) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.kernel(hidden_states) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +@add_start_docstrings( + """ + Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification. + """, + DATA_2_VEC_AUDIO_START_DOCSTRING, +) +class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.data2vec_audio = Data2VecAudioModel(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) + + tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] + self.tdnn = nn.ModuleList(tdnn_layers) + + self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim) + self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim) + + self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels) + + self.init_weights() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.data2vec_audio.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.data2vec_audio.parameters(): + param.requires_grad = False + + def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]): + """ + Computes the output length of the TDNN layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return (input_length - kernel_size) // stride + 1 + + for kernel_size in self.config.tdnn_kernel: + input_lengths = _conv_out_length(input_lengths, kernel_size, 1) + + return input_lengths + + @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_XVECTOR_CHECKPOINT, + output_type=XVectorOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_XVECTOR_EXPECTED_OUTPUT, + ) + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
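+
+        A minimal speaker-verification style sketch using the returned embeddings (the checkpoint name, the two
+        waveforms `audio_1`/`audio_2` and the 0.7 threshold are placeholders for illustration only):
+
+        ```python
+        >>> import torch
+        >>> from transformers import Wav2Vec2FeatureExtractor, Data2VecAudioForXVector
+
+        >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("<path-or-repo-id>")
+        >>> model = Data2VecAudioForXVector.from_pretrained("<path-or-repo-id>")
+        >>> inputs = feature_extractor([audio_1, audio_2], sampling_rate=16000, return_tensors="pt", padding=True)
+        >>> with torch.no_grad():
+        ...     embeddings = model(**inputs).embeddings
+        >>> similarity = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=-1)
+        >>> same_speaker = bool(similarity > 0.7)  # the threshold is task/dataset dependent
+        ```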
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.data2vec_audio( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + + for tdnn_layer in self.tdnn: + hidden_states = tdnn_layer(hidden_states) + + # Statistic Pooling + if attention_mask is None: + mean_features = hidden_states.mean(dim=1) + std_features = hidden_states.std(dim=1) + else: + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) + tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) + mean_features = [] + std_features = [] + for i, length in enumerate(tdnn_output_lengths): + mean_features.append(hidden_states[i, :length].mean(dim=0)) + std_features.append(hidden_states[i, :length].std(dim=0)) + mean_features = torch.stack(mean_features) + std_features = torch.stack(std_features) + statistic_pooling = torch.cat([mean_features, std_features], dim=-1) + + output_embeddings = self.feature_extractor(statistic_pooling) + logits = self.classifier(output_embeddings) + + loss = None + if labels is not None: + loss = self.objective(logits, labels) + + if not return_dict: + output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return XVectorOutput( + loss=loss, + logits=logits, + embeddings=output_embeddings, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/data2vec/modeling_data2vec.py b/src/transformers/models/data2vec/modeling_data2vec_text.py similarity index 75% rename from src/transformers/models/data2vec/modeling_data2vec.py rename to src/transformers/models/data2vec/modeling_data2vec_text.py index a3420d84b195a..75ba37c2a4152 100644 --- a/src/transformers/models/data2vec/modeling_data2vec.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -12,18 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch Data2Vec model.""" +"""PyTorch Data2VecText model.""" import math -from typing import Union, Optional import torch import torch.utils.checkpoint from packaging import version from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -import numpy as np from ...activations import ACT2FN, gelu from ...file_utils import ( add_code_sample_docstrings, @@ -31,7 +29,6 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) -from ...deepspeed import is_deepspeed_zero3_enabled from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, @@ -41,7 +38,6 @@ QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, - CausalLMOutput, ) from ...modeling_utils import ( PreTrainedModel, @@ -49,16 +45,8 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) -from ...modeling_utils import torch_int_div -from ...modeling_outputs import BaseModelOutput -from ...models.wav2vec2.modeling_wav2vec2 import ( - Wav2Vec2Adapter, - Wav2Vec2BaseModelOutput, - Wav2Vec2FeatureEncoder, - Wav2Vec2EncoderLayer, -) from ...utils import logging -from .configuration_data2vec import Data2VecConfig +from .configuration_data2vec import Data2VecTextConfig logger = logging.get_logger(__name__) @@ -67,7 +55,7 @@ _HIDDEN_STATES_START_POSITION = 2 _CHECKPOINT_FOR_DOC = "data2vec" -_CONFIG_FOR_DOC = "Data2VecConfig" +_CONFIG_FOR_DOC = "Data2VecTextConfig" _TOKENIZER_FOR_DOC = "RobertaTokenizer" # General docstring @@ -78,14 +66,14 @@ _CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" _CTC_EXPECTED_LOSS = 53.48 -DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = [ +DATA2VECTEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "data2vec", # See all data2vec models at https://huggingface.co/models?filter=data2vec ] -# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2Vec -class Data2VecForTextEmbeddings(nn.Module): +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2VecText +class Data2VecTextForTextEmbeddings(nn.Module): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ @@ -175,7 +163,7 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): return position_ids.unsqueeze(0).expand(input_shape) -class Data2VecSelfAttention(nn.Module): +class Data2VecTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -275,7 +263,7 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in Data2VecForTextModel forward() function) + # Apply the attention mask is (precomputed for all layers in Data2VecTextModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. 
@@ -303,7 +291,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertSelfOutput -class Data2VecSelfOutput(nn.Module): +class Data2VecTextSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -317,12 +305,12 @@ def forward(self, hidden_states, input_tensor): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2Vec -class Data2VecAttention(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2VecText +class Data2VecTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = Data2VecSelfAttention(config, position_embedding_type=position_embedding_type) - self.output = Data2VecSelfOutput(config) + self.self = Data2VecTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = Data2VecTextSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): @@ -368,7 +356,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertIntermediate -class Data2VecIntermediate(nn.Module): +class Data2VecTextIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) @@ -384,7 +372,7 @@ def forward(self, hidden_states): # Copied from transformers.models.bert.modeling_bert.BertOutput -class Data2VecOutput(nn.Module): +class Data2VecTextOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -398,21 +386,21 @@ def forward(self, hidden_states, input_tensor): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Data2Vec -class Data2VecLayer(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Data2VecText +class Data2VecTextLayer(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = Data2VecAttention(config) + self.attention = Data2VecTextAttention(config) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: if not self.is_decoder: raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = Data2VecAttention(config, position_embedding_type="absolute") - self.intermediate = Data2VecIntermediate(config) - self.output = Data2VecOutput(config) + self.crossattention = Data2VecTextAttention(config, position_embedding_type="absolute") + self.intermediate = Data2VecTextIntermediate(config) + self.output = Data2VecTextOutput(config) def forward( self, @@ -484,12 +472,12 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Data2Vec -class Data2VecEncoder(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Data2VecText +class Data2VecTextEncoder(nn.Module): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.ModuleList([Data2VecLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([Data2VecTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -583,7 +571,7 @@ def custom_forward(*inputs): # Copied from 
transformers.models.bert.modeling_bert.BertPooler -class Data2VecPooler(nn.Module): +class Data2VecTextPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -598,13 +586,14 @@ def forward(self, hidden_states): return pooled_output -class Data2VecPreTrainedModel(PreTrainedModel): +# Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->Data2VecText, roberta->data2vec-text +class Data2VecTextPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = Data2VecConfig + config_class = Data2VecTextConfig base_model_prefix = "data2vec" supports_gradient_checkpointing = True @@ -627,7 +616,7 @@ def _init_weights(self, module): module.weight.data.fill_(1.0) def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, Data2VecEncoder): + if isinstance(module, Data2VecTextEncoder): module.gradient_checkpointing = value def update_keys_to_ignore(self, config, del_keys_to_ignore): @@ -640,7 +629,7 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): ] -DATA2VEC_START_DOCSTRING = r""" +DATA2VECTEXT_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads @@ -651,12 +640,12 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): and behavior. Parameters: - config ([`Data2VecConfig`]): Model configuration class with all the parameters of the + config ([`Data2VecTextConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -DATA2VEC_INPUTS_DOCSTRING = r""" +DATA2VECTEXT_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the vocabulary. 
@@ -706,393 +695,11 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): """ -class Data2VecPadLayer(nn.Module): - def __init__(self, conv_pos_kernel_size): - super().__init__() - self.num_pad_remove = 1 if conv_pos_kernel_size % 2 == 0 else 0 - - def forward(self, hidden_states): - if self.num_pad_remove > 0: - hidden_states = hidden_states[:, :, : -self.num_pad_remove] - return hidden_states - - -class Data2VecPositionalConvLayer(nn.Module): - def __init__(self, config): - super().__init__() - self.conv = nn.Conv1d( - config.hidden_size, - config.hidden_size, - kernel_size=config.conv_pos_kernel_size, - padding=config.conv_pos_kernel_size // 2, - groups=config.num_conv_pos_embedding_groups, - ) - - self.padding = Data2VecPadLayer(config.conv_pos_kernel_size) - self.activation = ACT2FN[config.feat_extract_activation] - # no learnable parameters - self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False) - - def forward(self, hidden_states): - hidden_states = self.conv(hidden_states) - hidden_states = self.padding(hidden_states) - - hidden_states = hidden_states.transpose(1, 2) - hidden_states = self.layer_norm(hidden_states) - hidden_states = hidden_states.transpose(1, 2) - hidden_states = self.activation(hidden_states) - return hidden_states - - -class Data2VecPositionalConvEmbedding(nn.Module): - def __init__(self, config): - super().__init__() - self.layers = nn.ModuleList([Data2VecPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]) - - def forward(self, hidden_states): - hidden_states = hidden_states.transpose(1, 2) - for layer in self.layers: - hidden_states = layer(hidden_states) - hidden_states = hidden_states.transpose(1, 2) - return hidden_states - - -class Data2VecFeatureProjection(nn.Module): - def __init__(self, config): - super().__init__() - self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) - self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) - self.dropout = nn.Dropout(config.feat_proj_dropout) - - def forward(self, hidden_states): - # non-projected hidden states are needed for quantization - norm_hidden_states = self.layer_norm(hidden_states) - hidden_states = self.projection(norm_hidden_states) - hidden_states = self.dropout(hidden_states) - return hidden_states, norm_hidden_states - - -class Data2VecAudioEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.pos_conv_embed = Data2VecPositionalConvEmbedding(config) - self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout) - self.layers = nn.ModuleList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - - if attention_mask is not None: - # make sure padded tokens output 0 - hidden_states[~attention_mask] = 0.0 - - # extend attention_mask - attention_mask = (1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)) * -10000.0 - attention_mask = attention_mask.expand( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] - ) - - position_embeddings = self.pos_conv_embed(hidden_states) - hidden_states = hidden_states + position_embeddings - 
hidden_states = self.layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states) - - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() - - for layer in self.layers: - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) - - skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync - if self.gradient_checkpointing and self.training: - # create gradient checkpointing function - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer), - hidden_states, - attention_mask, - ) - else: - layer_outputs = layer( - hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) - hidden_states = layer_outputs[0] - - if skip_the_layer: - layer_outputs = (None, None) - - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -@add_start_docstrings( - "The bare Data2Vec Model for audio transformer outputting raw hidden-states without any specific head on top.", - DATA2VEC_START_DOCSTRING, -) -class Data2VecForAudioModel(Data2VecPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.config = config - self.feature_extractor = Wav2Vec2FeatureEncoder(config) - self.feature_projection = Data2VecFeatureProjection(config) - self.dropout = nn.Dropout(config.hidden_dropout) - - # model only needs masking vector if mask prob is > 0.0 - if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: - self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) - - self.encoder = Data2VecAudioEncoder(config) - - self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None - - # Initialize weights and apply final processing - self.post_init() - - def _get_feat_extract_output_lengths( - self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None - ): - """ - Computes the output length of the convolutional layers - """ - - add_adapter = self.config.add_adapter if add_adapter is None else add_adapter - - def _conv_out_length(input_length, kernel_size, stride): - # 1D convolutional layer output length formula taken - # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html - return torch_int_div(input_length - kernel_size, stride) + 1 - - for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): - input_lengths = _conv_out_length(input_lengths, kernel_size, stride) - - if add_adapter: - for _ in range(self.config.num_adapter_layers): - input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) - - return input_lengths - - def _get_feature_vector_attention_mask( - self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None - ): - # Effectively attention_mask.sum(-1), but 
not inplace to be able to run - # on inference mode. - non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] - - output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) - output_lengths = output_lengths.to(torch.long) - - batch_size = attention_mask.shape[0] - - attention_mask = torch.zeros( - (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device - ) - # these two operations makes sure that all values before the output lengths idxs are attended to - attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 - attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() - return attention_mask - - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPoolingAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_values, - attention_mask=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - extract_features = self.feature_extractor(input_values) - extract_features = extract_features.transpose(1, 2) - - if attention_mask is not None: - # compute reduced attention_mask corresponding to feature vectors - attention_mask = self._get_feature_vector_attention_mask( - extract_features.shape[1], attention_mask, add_adapter=False - ) - - hidden_states, extract_features = self.feature_projection(extract_features) - - encoder_outputs = self.encoder( - hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = encoder_outputs[0] - - if not return_dict: - return (hidden_states, extract_features) + encoder_outputs[1:] - - return Wav2Vec2BaseModelOutput( - last_hidden_state=hidden_states, - extract_features=extract_features, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -@add_start_docstrings( - """Data2Vec Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", - DATA2VEC_START_DOCSTRING, -) -class Data2VecForCTC(Data2VecPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.data2vec = Data2VecForAudioModel(config) - self.dropout = nn.Dropout(config.final_dropout) - - if config.vocab_size is None: - raise ValueError( - f"You are trying to instantiate {self.__class__} with a configuration that " - "does not define the vocabulary size of the language model head. Please " - "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. " - "or define `vocab_size` of your model's configuration." - ) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - - # Initialize weights and apply final processing - self.post_init() - - def freeze_feature_encoder(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. 
- """ - self.wav2vec2.feature_extractor._freeze_parameters() - - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_PROCESSOR_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=CausalLMOutput, - config_class=_CONFIG_FOR_DOC, - expected_output=_CTC_EXPECTED_OUTPUT, - expected_loss=_CTC_EXPECTED_LOSS, - ) - def forward( - self, - input_values, - attention_mask=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): - Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to - the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. - All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., - config.vocab_size - 1]`. - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.data2vec( - input_values, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - hidden_states = self.dropout(hidden_states) - - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - - if labels.max() >= self.config.vocab_size: - raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") - - # retrieve loss input_lengths from attention_mask - attention_mask = ( - attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) - ) - input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) - - # assuming that padded tokens are filled with -100 - # when not being attended to - labels_mask = labels >= 0 - target_lengths = labels_mask.sum(-1) - flattened_targets = labels.masked_select(labels_mask) - - # ctc_loss doesn't support fp16 - log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) - - with torch.backends.cudnn.flags(enabled=False): - loss = nn.functional.ctc_loss( - log_probs, - flattened_targets, - input_lengths, - target_lengths, - blank=self.config.pad_token_id, - reduction=self.config.ctc_loss_reduction, - zero_infinity=self.config.ctc_zero_infinity, - ) - - if not return_dict: - output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutput( - loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions - ) - - @add_start_docstrings( - "The bare Data2Vec Model for text transformer outputting raw hidden-states without any specific head on top.", - DATA2VEC_START_DOCSTRING, + "The bare Data2VecText Model for text transformer outputting raw hidden-states without any specific head on top.", + DATA2VECTEXT_START_DOCSTRING, ) -class Data2VecForTextModel(Data2VecPreTrainedModel): +class Data2VecTextModel(Data2VecTextPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -1114,10 +721,10 @@ def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config - self.embeddings = Data2VecForTextEmbeddings(config) - self.encoder = Data2VecEncoder(config) + self.embeddings = Data2VecTextForTextEmbeddings(config) + self.encoder = 
Data2VecTextEncoder(config) - self.pooler = Data2VecPooler(config) if add_pooling_layer else None + self.pooler = Data2VecTextPooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() @@ -1136,7 +743,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1275,9 +882,9 @@ def forward( @add_start_docstrings( - """Data2Vec Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VEC_START_DOCSTRING + """Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VECTEXT_START_DOCSTRING ) -class Data2VecForCausalLM(Data2VecPreTrainedModel): +class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel): _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_unexpected = [r"pooler"] @@ -1286,10 +893,10 @@ def __init__(self, config): super().__init__(config) if not config.is_decoder: - logger.warning("If you want to use `Data2VecLMHeadModel` as a standalone, add `is_decoder=True.`") + logger.warning("If you want to use `Data2VecTextLMHeadModel` as a standalone, add `is_decoder=True.`") - self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) - self.lm_head = Data2VecLMHead(config) + self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.lm_head = Data2VecTextLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) @@ -1303,7 +910,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1352,13 +959,13 @@ def forward( Example: ```python - >>> from transformers import Data2VecTokenizer, Data2VecForCausalLM, Data2VecConfig + >>> from transformers import Data2VecTextTokenizer, Data2VecTextForCausalLM, Data2VecTextConfig >>> import torch - >>> tokenizer = Data2VecTokenizer.from_pretrained("data2vec-base") - >>> config = Data2VecConfig.from_pretrained("data2vec-base") + >>> tokenizer = Data2VecTextTokenizer.from_pretrained("data2vec-base") + >>> config = Data2VecTextConfig.from_pretrained("data2vec-base") >>> config.is_decoder = True - >>> model = Data2VecForCausalLM.from_pretrained("data2vec-base", config=config) + >>> model = Data2VecTextForCausalLM.from_pretrained("data2vec-base", config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1428,8 +1035,8 @@ def _reorder_cache(self, past, beam_idx): return reordered_past -@add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VEC_START_DOCSTRING) -class 
Data2VecForMaskedLM(Data2VecPreTrainedModel): +@add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VECTEXT_START_DOCSTRING) +class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel): _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] _keys_to_ignore_on_load_unexpected = [r"pooler"] @@ -1439,12 +1046,12 @@ def __init__(self, config): if config.is_decoder: logger.warning( - "If you want to use `Data2VecForMaskedLM` make sure `config.is_decoder=False` for " + "If you want to use `Data2VecTextForMaskedLM` make sure `config.is_decoder=False` for " "bi-directional self-attention." ) - self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) - self.lm_head = Data2VecLMHead(config) + self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.lm_head = Data2VecTextLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) @@ -1458,7 +1065,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1524,9 +1131,9 @@ def forward( ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Data2Vec -class Data2VecLMHead(nn.Module): - """Data2Vec Head for masked language modeling.""" +# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Data2VecText +class Data2VecTextLMHead(nn.Module): + """Data2VecText Head for masked language modeling.""" def __init__(self, config): super().__init__() @@ -1554,12 +1161,12 @@ def _tie_weights(self): @add_start_docstrings( """ - Data2Vec Model transformer with a sequence classification/regression head on top (a linear layer on top of the + Data2VecText Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - DATA2VEC_START_DOCSTRING, + DATA2VECTEXT_START_DOCSTRING, ) -class Data2VecForSequenceClassification(Data2VecPreTrainedModel): +class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config): @@ -1567,13 +1174,13 @@ def __init__(self, config): self.num_labels = config.num_labels self.config = config - self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) - self.classifier = Data2VecClassificationHead(config) + self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.classifier = Data2VecTextClassificationHead(config) # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1652,18 +1259,18 @@ def forward( @add_start_docstrings( """ - Data2Vec Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + Data2VecText Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - DATA2VEC_START_DOCSTRING, + DATA2VECTEXT_START_DOCSTRING, ) -class Data2VecForMultipleChoice(Data2VecPreTrainedModel): +class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config): super().__init__(config) - self.data2vec = Data2VecForTextModel(config) + self.data2vec = Data2VecTextModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) @@ -1671,7 +1278,7 @@ def __init__(self, config): self.post_init() @add_start_docstrings_to_model_forward( - DATA2VEC_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, @@ -1747,12 +1354,12 @@ def forward( @add_start_docstrings( """ - Data2Vec Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Data2VecText Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - DATA2VEC_START_DOCSTRING, + DATA2VECTEXT_START_DOCSTRING, ) -class Data2VecForTokenClassification(Data2VecPreTrainedModel): +class Data2VecTextForTokenClassification(Data2VecTextPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -1760,7 +1367,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) + self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1770,7 +1377,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1830,8 +1437,8 @@ def forward( ) -# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Data2Vec -class Data2VecClassificationHead(nn.Module): +# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Data2VecText +class Data2VecTextClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): @@ -1855,12 +1462,12 @@ def forward(self, features, **kwargs): @add_start_docstrings( """ - Data2Vec Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + Data2VecText Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - DATA2VEC_START_DOCSTRING, + DATA2VECTEXT_START_DOCSTRING, ) -class Data2VecForQuestionAnswering(Data2VecPreTrainedModel): +class Data2VecTextForQuestionAnswering(Data2VecTextPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -1868,13 +1475,13 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.data2vec = Data2VecForTextModel(config, add_pooling_layer=False) + self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(DATA2VEC_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, From 1a24eae1ccd676e75a4faff428e7358defcf62af Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 13:43:25 +0000 Subject: [PATCH 092/121] add inputs to logits to data2vec' --- .../models/data2vec/configuration_data2vec_audio.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index b2d1856a63b5e..3e8176a119003 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +import math logger = logging.get_logger(__name__) @@ -318,3 +319,7 @@ def __init__( self.tdnn_kernel = list(tdnn_kernel) self.tdnn_dilation = list(tdnn_dilation) self.xvector_output_dim = xvector_output_dim + + @property + def inputs_to_logits_ratio(self): + return math.prod(self.conv_stride) From 71d8b742aa328e14e431b24c0e4e9b62a24da9e8 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Sun, 20 Feb 2022 15:00:43 -0500 Subject: [PATCH 093/121] Update names and fix style, quality and repo consistency --- docs/source/model_doc/data2vec.mdx | 66 ++- src/transformers/__init__.py | 52 +- .../models/auto/configuration_auto.py | 4 +- src/transformers/models/auto/modeling_auto.py | 19 +- src/transformers/models/data2vec/__init__.py | 58 +- ...vec_audio.py => configuration_data2vec.py} | 150 ++++- .../data2vec/configuration_data2vec_text.py | 147 ----- ..._original_pytorch_checkpoint_to_pytorch.py | 30 +- ..._original_pytorch_checkpoint_to_pytorch.py | 10 +- .../data2vec/modeling_data2vec_audio.py | 63 +-- .../models/data2vec/modeling_data2vec_text.py | 13 +- src/transformers/utils/dummy_pt_objects.py | 60 +- tests/test_modeling_data2vec.py | 519 ----------------- tests/test_modeling_data2vec_audio.py | 14 + tests/test_modeling_data2vec_text.py | 534 ++++++++++++++++++ utils/check_repo.py | 5 +- utils/tests_fetcher.py | 1 + 17 files changed, 929 insertions(+), 816 deletions(-) rename src/transformers/models/data2vec/{configuration_data2vec_audio.py => configuration_data2vec.py} (71%) delete mode 100644 src/transformers/models/data2vec/configuration_data2vec_text.py create mode 100644 tests/test_modeling_data2vec_audio.py create mode 100644 tests/test_modeling_data2vec_text.py diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index 2471ccc883c32..6474ceb4758ca 
100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -40,46 +40,76 @@ This model was contributed by [edugp](https://huggingface.co/edugp). The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec). -## Data2VecConfig +## Data2VecTextConfig -[[autodoc]] Data2VecConfig +[[autodoc]] Data2VecTextConfig -## Data2VecForAudioModel +## Data2VecAudioConfig -[[autodoc]] Data2VecForAudioModel +[[autodoc]] Data2VecAudioConfig + +## Data2VecAudioModel + +[[autodoc]] Data2VecAudioModel + - forward + + +## Data2VecAudioForAudioFrameClassification + +[[autodoc]] Data2VecAudioForAudioFrameClassification + - forward + +## Data2VecAudioForCTC + +[[autodoc]] Data2VecAudioForCTC + - forward + +## Data2VecAudioForPreTraining + +[[autodoc]] Data2VecAudioForPreTraining + - forward + +## Data2VecAudioForSequenceClassification + +[[autodoc]] Data2VecAudioForSequenceClassification + - forward + +## Data2VecAudioForXVector + +[[autodoc]] Data2VecAudioForXVector - forward -## Data2VecForTextModel +## Data2VecTextModel -[[autodoc]] Data2VecForTextModel +[[autodoc]] Data2VecTextModel - forward -## Data2VecForCausalLM +## Data2VecTextForCausalLM -[[autodoc]] Data2VecForCausalLM +[[autodoc]] Data2VecTextForCausalLM - forward -## Data2VecForMaskedLM +## Data2VecTextForMaskedLM -[[autodoc]] Data2VecForMaskedLM +[[autodoc]] Data2VecTextForMaskedLM - forward -## Data2VecForSequenceClassification +## Data2VecTextForSequenceClassification -[[autodoc]] Data2VecForSequenceClassification +[[autodoc]] Data2VecTextForSequenceClassification - forward -## Data2VecForMultipleChoice +## Data2VecTextForMultipleChoice -[[autodoc]] Data2VecForMultipleChoice +[[autodoc]] Data2VecTextForMultipleChoice - forward -## Data2VecForTokenClassification +## Data2VecTextForTokenClassification -[[autodoc]] Data2VecForTokenClassification +[[autodoc]] Data2VecTextForTokenClassification - forward -## Data2VecForQuestionAnswering +## Data2VecTextForQuestionAnswering -[[autodoc]] Data2VecForQuestionAnswering +[[autodoc]] Data2VecTextForQuestionAnswering - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index baec2791a6042..3d1ab30f10251 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -204,7 +204,7 @@ "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.cpm": ["CpmTokenizer"], "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"], - "models.data2vec": ["DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecConfig"], + "models.data2vec": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecTextConfig"], "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"], "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], @@ -875,15 +875,21 @@ _import_structure["models.data2vec"].extend( [ "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", - "Data2VecForAudioModel", - "Data2VecForCausalLM", - "Data2VecForMaskedLM", - "Data2VecForMultipleChoice", - "Data2VecForQuestionAnswering", - "Data2VecForSequenceClassification", - "Data2VecForTextModel", - "Data2VecForTokenClassification", - "Data2VecPreTrainedModel", + "Data2VecAudioForAudioFrameClassification", + "Data2VecAudioForCTC", + "Data2VecAudioForPreTraining", + "Data2VecAudioForSequenceClassification", + "Data2VecAudioForXVector", + "Data2VecAudioModel", + 
"Data2VecAudioPreTrainedModel", + "Data2VecTextForCausalLM", + "Data2VecTextForMaskedLM", + "Data2VecTextForMultipleChoice", + "Data2VecTextForQuestionAnswering", + "Data2VecTextForSequenceClassification", + "Data2VecTextForTokenClassification", + "Data2VecTextModel", + "Data2VecTextPreTrainedModel", ] ) _import_structure["models.deberta"].extend( @@ -2475,7 +2481,7 @@ from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.cpm import CpmTokenizer from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer - from .models.data2vec import DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecConfig + from .models.data2vec import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecTextConfig from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig @@ -3031,15 +3037,21 @@ ) from .models.data2vec import ( DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecForAudioModel, - Data2VecForCausalLM, - Data2VecForMaskedLM, - Data2VecForMultipleChoice, - Data2VecForQuestionAnswering, - Data2VecForSequenceClassification, - Data2VecForTextModel, - Data2VecForTokenClassification, - Data2VecPreTrainedModel, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForPreTraining, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Data2VecAudioPreTrainedModel, + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextModel, + Data2VecTextPreTrainedModel, ) from .models.deberta import ( DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index df83f2b872798..a63fc53f497ef 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -91,7 +91,7 @@ ("reformer", "ReformerConfig"), ("longformer", "LongformerConfig"), ("roberta", "RobertaConfig"), - ("data2vec", "Data2VecConfig"), + ("data2vec", "Data2VecTextConfig"), ("deberta-v2", "DebertaV2Config"), ("deberta", "DebertaConfig"), ("flaubert", "FlaubertConfig"), @@ -178,7 +178,7 @@ ("xlnet", "XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("data2vec", "DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("data2vec", "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 772aa885f4d98..4ebbe0f9bd4f4 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -83,7 +83,7 @@ ("bart", "BartModel"), ("longformer", "LongformerModel"), ("roberta", "RobertaModel"), - ("data2vec", "Data2VecForTextModel"), + ("data2vec", "Data2VecTextModel"), ("layoutlm", "LayoutLMModel"), ("squeezebert", "SqueezeBertModel"), ("bert", "BertModel"), @@ -134,7 +134,7 @@ ("fsmt", "FSMTForConditionalGeneration"), ("longformer", 
"LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecForMaskedLM"), + ("data2vec", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForPreTraining"), ("big_bird", "BigBirdForPreTraining"), @@ -193,7 +193,7 @@ ("bart", "BartForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecForMaskedLM"), + ("data2vec", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForMaskedLM"), ("openai-gpt", "OpenAIGPTLMHeadModel"), @@ -255,6 +255,7 @@ ("blenderbot-small", "BlenderbotSmallForCausalLM"), ("megatron-bert", "MegatronBertForCausalLM"), ("speech_to_text_2", "Speech2Text2ForCausalLM"), + ("data2vec", "Data2VecTextForCausalLM"), ] ) @@ -341,7 +342,7 @@ ("xlm-roberta", "XLMRobertaForMaskedLM"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecForMaskedLM"), + ("data2vec", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForMaskedLM"), ("megatron-bert", "MegatronBertForMaskedLM"), @@ -422,7 +423,7 @@ ("bart", "BartForSequenceClassification"), ("longformer", "LongformerForSequenceClassification"), ("roberta", "RobertaForSequenceClassification"), - ("data2vec", "Data2VecForSequenceClassification"), + ("data2vec", "Data2VecTextForSequenceClassification"), ("squeezebert", "SqueezeBertForSequenceClassification"), ("layoutlm", "LayoutLMForSequenceClassification"), ("bert", "BertForSequenceClassification"), @@ -489,6 +490,7 @@ ("deberta-v2", "DebertaV2ForQuestionAnswering"), ("ibert", "IBertForQuestionAnswering"), ("splinter", "SplinterForQuestionAnswering"), + ("data2vec", "Data2VecTextForQuestionAnswering"), ] ) @@ -535,6 +537,7 @@ ("deberta-v2", "DebertaV2ForTokenClassification"), ("gpt2", "GPT2ForTokenClassification"), ("ibert", "IBertForTokenClassification"), + ("data2vec", "Data2VecTextForTokenClassification"), ] ) @@ -556,7 +559,7 @@ ("xlm-roberta", "XLMRobertaForMultipleChoice"), ("longformer", "LongformerForMultipleChoice"), ("roberta", "RobertaForMultipleChoice"), - ("data2vec", "Data2VecForMultipleChoice"), + ("data2vec", "Data2VecTextForMultipleChoice"), ("squeezebert", "SqueezeBertForMultipleChoice"), ("bert", "BertForMultipleChoice"), ("distilbert", "DistilBertForMultipleChoice"), @@ -592,6 +595,7 @@ ("sew", "SEWForSequenceClassification"), ("sew-d", "SEWDForSequenceClassification"), ("wavlm", "WavLMForSequenceClassification"), + ("data2vec", "Data2VecAudioForSequenceClassification"), ] ) @@ -605,6 +609,7 @@ ("sew", "SEWForCTC"), ("sew-d", "SEWDForCTC"), ("wavlm", "WavLMForCTC"), + ("data2vec", "Data2VecAudioForCTC"), ] ) @@ -614,6 +619,7 @@ ("wav2vec2", "Wav2Vec2ForAudioFrameClassification"), ("unispeech-sat", "UniSpeechSatForAudioFrameClassification"), ("wavlm", "WavLMForAudioFrameClassification"), + ("data2vec", "Data2VecAudioForAudioFrameClassification"), ] ) @@ -623,6 +629,7 @@ ("wav2vec2", "Wav2Vec2ForXVector"), ("unispeech-sat", "UniSpeechSatForXVector"), ("wavlm", "WavLMForXVector"), + ("data2vec", "Data2VecAudioForXVector"), ] ) diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 1abf2a9f9efac..8abc7d3def191 100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -22,53 +22,71 @@ _import_structure = { - "configuration_data2vec_text": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecTextConfig", 
"Data2VecTextOnnxConfig"], - "configuration_data2vec_audio": ["DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig", "Data2VecAudioOnnxConfig"], + "configuration_data2vec": [ + "DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP", + "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", + "Data2VecAudioConfig", + "Data2VecAudioOnnxConfig", + "Data2VecTextConfig", + "Data2VecTextOnnxConfig", + ], } if is_torch_available(): + _import_structure["modeling_data2vec_audio"] = [ + "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST", + "Data2VecAudioForAudioFrameClassification", + "Data2VecAudioForCTC", + "Data2VecAudioForPreTraining", + "Data2VecAudioForSequenceClassification", + "Data2VecAudioForXVector", + "Data2VecAudioModel", + "Data2VecAudioPreTrainedModel", + ] _import_structure["modeling_data2vec_text"] = [ "DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", - "Data2VecTextModel", "Data2VecTextForCausalLM", "Data2VecTextForMaskedLM", "Data2VecTextForMultipleChoice", "Data2VecTextForQuestionAnswering", "Data2VecTextForSequenceClassification", - "Data2VecTextForTextModel", "Data2VecTextForTokenClassification", + "Data2VecTextModel", "Data2VecTextPreTrainedModel", ] - _import_structure["modeling_data2vec_audio"] = [ - "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST", - "Data2VecAudioModel", - "Data2VecAudioPreTrainedModel", - "Data2VecAudioForCTC", - ] if TYPE_CHECKING: - from .configuration_data2vec_text import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecTextConfig, Data2VecTextOnnxConfig - from .configuration_data2vec_audio import DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig, Data2VecAudioOnnxConfig + from .configuration_data2vec import ( + DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, + DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, + Data2VecAudioConfig, + Data2VecAudioOnnxConfig, + Data2VecTextConfig, + Data2VecTextOnnxConfig, + ) if is_torch_available(): + from .modeling_data2vec_audio import ( + DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForPreTraining, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Data2VecAudioPreTrainedModel, + ) from .modeling_data2vec_text import ( DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecTextForAudioModel, Data2VecTextForCausalLM, Data2VecTextForMaskedLM, Data2VecTextForMultipleChoice, Data2VecTextForQuestionAnswering, Data2VecTextForSequenceClassification, - Data2VecTextForTextModel, Data2VecTextForTokenClassification, + Data2VecTextModel, Data2VecTextPreTrainedModel, ) - from .modeling_data2vec_audio import ( - DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecAudioModel, - Data2VecAudioPreTrainedModel, - Data2VecAudioForCTC, - ) else: import sys diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec.py similarity index 71% rename from src/transformers/models/data2vec/configuration_data2vec_audio.py rename to src/transformers/models/data2vec/configuration_data2vec.py index 3e8176a119003..4958f60135aff 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,26 +12,152 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Data2VecAudio model configuration""" +""" Data2VecText configuration""" +from collections import OrderedDict +from typing import Mapping from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig from ...utils import logging import math logger = logging.get_logger(__name__) -DATA_2_VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = { +DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json", +} + +DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = { "facebook/data2vec-base-960h": "https://huggingface.co/facebook/data2vec-audio-base-960h/resolve/main/config.json", # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio } +class Data2VecTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Data2VecTextModel`] and [`Data2VecTextModel`]. It + is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecText + [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. 
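With the rename in this commit, the text side of the integration is exposed through `Data2VecTextConfig` and the `Data2VecText*` model classes registered in `__init__.py` above. A quick sketch of that surface with deliberately tiny, illustrative sizes (the values below are toy numbers for a fast smoke test, not the model's defaults):

```python
# Hedged sketch of the renamed public API; config sizes are toy values chosen only so the
# randomly initialized model builds quickly.
from transformers import Data2VecTextConfig, Data2VecTextModel

config = Data2VecTextConfig(
    vocab_size=100,
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=64,
)
model = Data2VecTextModel(config)
print(model.config.model_type)  # "data2vec_text"
print(sum(p.numel() for p in model.parameters()))  # parameter count of the toy model
```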
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from transformers import Data2VecTextModel, Data2VecTextConfig + + >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration + >>> configuration = Data2VecTextConfig() + + >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration + >>> model = Data2VecTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "data2vec_text" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class Data2VecTextOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + class Data2VecAudioConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate an - Data2VecAudio model according to the specified arguments, defining the model architecture. 
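The `Data2VecTextOnnxConfig` defined above only declares the dynamic axes of the two text inputs. A small sketch of what that property yields, assuming the class is importable from the merged `configuration_data2vec` module as named in this patch:

```python
# Hedged sketch: inspect the dynamic-axis declaration of the ONNX config added above.
from transformers import Data2VecTextConfig
from transformers.models.data2vec.configuration_data2vec import Data2VecTextOnnxConfig

onnx_config = Data2VecTextOnnxConfig(Data2VecTextConfig())
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'})])
```

Both axes are dynamic, which is what an ONNX exporter needs in order to build dummy inputs and trace batch- and length-independent graphs.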
Instantiating a configuration - with the defaults will yield a similar configuration to that of the Data2VecAudio + This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate + an Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Data2VecAudio [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -40,10 +166,10 @@ class Data2VecAudioConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 32): - Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`]. Vocabulary size of the - model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward - method of [`Data2VecAudioModel`]. + Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`]. Vocabulary size + of the model. Defines the different tokens that can be represented by the *inputs_ids* passed to the + forward method of [`Data2VecAudioModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -157,8 +283,8 @@ class Data2VecAudioConfig(PretrainedConfig): xvector_output_dim (`int`, *optional*, defaults to 512): Dimensionality of the *XVector* embedding vectors. add_adapter (`bool`, *optional*, defaults to `False`): - Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful for - warm-starting Data2VecAudio for SpeechEncoderDecoder models. + Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful + for warm-starting Data2VecAudio for SpeechEncoderDecoder models. adapter_kernel_size (`int`, *optional*, defaults to 3): Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. adapter_stride (`int`, *optional*, defaults to 2): diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py deleted file mode 100644 index a6843c650cf1c..0000000000000 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ /dev/null @@ -1,147 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Data2VecText configuration""" -from collections import OrderedDict -from typing import Mapping - -from ...configuration_utils import PretrainedConfig -from ...onnx import OnnxConfig -from ...utils import logging - - -logger = logging.get_logger(__name__) - -DATA2VEC_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json", -} - - -class Data2VecTextConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Data2VecTextForTextModel`] and - [`Data2VecTextForAudioModel`]. It is used to instantiate a Data2VecText model according to the specified arguments, - defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration - to that of the Data2VecText [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 30522): - Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"silu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): - The dropout ratio for the attention probabilities. - max_position_embeddings (`int`, *optional*, defaults to 512): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). 
- For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. - - Examples: - - ```python - >>> from transformers import Data2VecTextModel, Data2VecTextConfig - - >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration - >>> configuration = Data2VecTextConfig() - - >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration - >>> model = Data2VecTextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "data2vec_text" - - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - position_embedding_type="absolute", - use_cache=True, - classifier_dropout=None, - **kwargs - ): - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout - - -class Data2VecTextOnnxConfig(OnnxConfig): - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("input_ids", {0: "batch", 1: "sequence"}), - ("attention_mask", {0: "batch", 1: "sequence"}), - ] - ) diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py index 9445afe54049c..4ffdb27c082d8 100644 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -17,19 +17,19 @@ import argparse import os - -import torch from functools import reduce -from transformers import Wav2Vec2Processor, logging import fairseq +import torch from datasets import load_dataset +from transformers import Wav2Vec2Processor, logging +from transformers.models.data2vec.configuration_data2vec import Data2VecAudioConfig + # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy # noqa: F401 +from transformers.models.data2vec.modeling_data2vec_audio import 
Data2VecAudioForCTC, Data2VecAudioModel -from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioModel, Data2VecAudioForCTC -from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -151,9 +151,7 @@ def set_weights(full_name, module, fsq_value, hf_weight_path): hf_value = hf_weight.data if fsq_value.shape != hf_value.shape: - raise ValueError( - f"{full_name} has size {fsq_value.shape}, but {hf_value.shape} was found." - ) + raise ValueError(f"{full_name} has size {fsq_value.shape}, but {hf_value.shape} was found.") hf_weight.data = fsq_value logger.info(f"{full_name} was correctly initialized from {hf_weight_path}.") @@ -235,15 +233,15 @@ def load_data2vec(path): input_values = inputs.input_values attention_mask = inputs.attention_mask -# input_values = inputs.input_values[:, :-1] -# attention_mask = inputs.attention_mask[:, :-1] + # input_values = inputs.input_values[:, :-1] + # attention_mask = inputs.attention_mask[:, :-1] hf_wav2vec.eval() model.eval() if is_finetuned: - their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)["encoder_out"].transpose( - 0, 1 - ) + their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)[ + "encoder_out" + ].transpose(0, 1) our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["logits"] pred_ids = torch.argmax(our_output, dim=-1) @@ -251,9 +249,9 @@ def load_data2vec(path): print(f"Expected Output: {ds[:4]['text']}, Pred: {output_string}") else: - their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)["layer_results"][-1][0].transpose( - 0, 1 - ) + their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)[ + "layer_results" + ][-1][0].transpose(0, 1) our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["last_hidden_state"] print(our_output.shape, their_output.shape) diff --git a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py index 4173bc3085c5e..5dff263d7ab55 100644 --- a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py @@ -24,7 +24,7 @@ from fairseq.modules import TransformerSentenceEncoderLayer from packaging import version -from transformers import Data2VecConfig, Data2VecForMaskedLM, Data2VecForSequenceClassification +from transformers import Data2VecTextConfig, Data2VecTextForMaskedLM, Data2VecTextForSequenceClassification from transformers.models.bert.modeling_bert import ( BertIntermediate, BertLayer, @@ -61,7 +61,7 @@ def convert_data2vec_checkpoint_to_pytorch( data2vec.eval() # disable dropout data2vec_model = data2vec.models[0] data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder - config = Data2VecConfig( + config = Data2VecTextConfig( vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings, hidden_size=data2vec_model.args.encoder_embed_dim, num_hidden_layers=data2vec_model.args.encoder_layers, @@ -75,7 +75,7 @@ def convert_data2vec_checkpoint_to_pytorch( config.num_labels = data2vec.model.classification_heads["mnli"].out_proj.weight.shape[0] print("Our BERT config:", 
config) - model = Data2VecForSequenceClassification(config) if classification_head else Data2VecForMaskedLM(config) + model = Data2VecTextForSequenceClassification(config) if classification_head else Data2VecTextForMaskedLM(config) model.eval() # Now let's copy all the weights. @@ -180,7 +180,7 @@ def convert_data2vec_checkpoint_to_pytorch( parser = argparse.ArgumentParser() # Required parameters parser.add_argument( - "--data2vec_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." + "--checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." @@ -190,5 +190,5 @@ def convert_data2vec_checkpoint_to_pytorch( ) args = parser.parse_args() convert_data2vec_checkpoint_to_pytorch( - args.data2vec_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head + args.checkpoint_path, args.pytorch_dump_folder_path, args.classification_head ) diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 933458d2b5d4d..5fd4cd29688c5 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -33,15 +33,10 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) -from ...modeling_outputs import ( - BaseModelOutput, - CausalLMOutput, - SequenceClassifierOutput, - TokenClassifierOutput, -) +from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput from ...modeling_utils import PreTrainedModel, torch_int_div from ...utils import logging -from .configuration_data2vec_audio import Data2VecAudioConfig +from .configuration_data2vec import Data2VecAudioConfig logger = logging.get_logger(__name__) @@ -76,7 +71,7 @@ _XVECTOR_EXPECTED_OUTPUT = 0.98 -DATA_2_VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = [ +DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = [ "facebook/data2vec-audio-base-960h", "facebook/data2vec-audio-large-960h", "facebook/data2vec-audio-large-960h-lv60", @@ -411,7 +406,9 @@ def forward(self, hidden_states): class Data2VecAudioPositionalConvEmbedding(nn.Module): def __init__(self, config): super().__init__() - self.layers = nn.ModuleList([Data2VecAudioPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]) + self.layers = nn.ModuleList( + [Data2VecAudioPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)] + ) def forward(self, hidden_states): hidden_states = hidden_states.transpose(1, 2) @@ -426,9 +423,9 @@ class Data2VecAudioFeatureEncoder(nn.Module): def __init__(self, config): super().__init__() - self.conv_layers = nn.ModuleList([ - Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) - ]) + self.conv_layers = nn.ModuleList( + [Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)] + ) self.gradient_checkpointing = False self._requires_grad = True @@ -930,7 +927,7 @@ def _set_gradient_checkpointing(self, module, value=False): module.gradient_checkpointing = value -DATA_2_VEC_AUDIO_START_DOCSTRING = r""" +DATA2VEC_AUDIO_START_DOCSTRING = r""" Data2VecAudio was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, 
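For reference, the renamed `--checkpoint_path` argument above is passed positionally into `convert_data2vec_checkpoint_to_pytorch`, so the script can also be driven programmatically. A hedged sketch only: the paths are placeholders, not files referenced by this patch, and fairseq must be installed because the module loads the original checkpoint through it.

```python
# Hedged sketch: positional arguments mirror the argparse wiring shown above.
# Both paths are placeholders; requires fairseq to be installed.
from transformers.models.data2vec.convert_data2vec_text_original_pytorch_checkpoint_to_pytorch import (
    convert_data2vec_checkpoint_to_pytorch,
)

convert_data2vec_checkpoint_to_pytorch(
    "/path/to/fairseq/data2vec_text_checkpoint.pt",  # placeholder checkpoint path
    "/path/to/output/data2vec-text",                 # placeholder dump folder
    False,                                           # classification_head
)
```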
Michael Auli. @@ -949,7 +946,7 @@ def _set_gradient_checkpointing(self, module, value=False): """ -DATA_2_VEC_AUDIO_INPUTS_DOCSTRING = r""" +DATA2VEC_AUDIO_INPUTS_DOCSTRING = r""" Args: input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file @@ -969,10 +966,10 @@ def _set_gradient_checkpointing(self, module, value=False): `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == True`. For all models whose processor has `config.return_attention_mask == False`, such as - [data2vec-audio-base](https://huggingface.co/facebook/data2vec-audio-base-960h), `attention_mask` should **not** be - passed to avoid degraded performance when doing batched inference. For such models `input_values` should - simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly - different results depending on whether `input_values` is padded or not. + [data2vec-audio-base](https://huggingface.co/facebook/data2vec-audio-base-960h), `attention_mask` should + **not** be passed to avoid degraded performance when doing batched inference. For such models + `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware that these + models also yield slightly different results depending on whether `input_values` is padded or not. @@ -989,7 +986,7 @@ def _set_gradient_checkpointing(self, module, value=False): @add_start_docstrings( "The bare Data2VecAudio Model transformer outputting raw hidden-states without any specific head on top.", - DATA_2_VEC_AUDIO_START_DOCSTRING, + DATA2VEC_AUDIO_START_DOCSTRING, ) class Data2VecAudioModel(Data2VecAudioPreTrainedModel): def __init__(self, config: Data2VecAudioConfig): @@ -1062,7 +1059,7 @@ def _mask_hidden_states( return hidden_states - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_PROCESSOR_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1124,7 +1121,7 @@ def forward( ) -@add_start_docstrings("""Data2VecAudio Model with a quantizer and `VQ` head on top.""", DATA_2_VEC_AUDIO_START_DOCSTRING) +@add_start_docstrings("""Data2VecAudio Model with a quantizer and `VQ` head on top.""", DATA2VEC_AUDIO_START_DOCSTRING) class Data2VecAudioForPreTraining(Data2VecAudioPreTrainedModel): def __init__(self, config: Data2VecAudioConfig): super().__init__(config) @@ -1172,7 +1169,7 @@ def compute_contrastive_logits( logits = logits / temperature return logits - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Data2VecAudioForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1199,7 +1196,7 @@ def forward( ```python >>> import torch >>> from transformers import Wav2Vec2FeatureExtractor, Data2VecAudioForPreTraining - >>> from transformers.models.data2vec-audio.modeling_data2vec-audio import _compute_mask_indices + >>> from transformers.models.data2vec.modeling_data2vec import _compute_mask_indices >>> from datasets import load_dataset >>> import soundfile as sf @@ -1333,7 +1330,7 @@ def forward( @add_start_docstrings( """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", - 
DATA_2_VEC_AUDIO_START_DOCSTRING, + DATA2VEC_AUDIO_START_DOCSTRING, ) class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel): def __init__(self, config): @@ -1361,7 +1358,7 @@ def freeze_feature_encoder(self): """ self.data2vec_audio.feature_extractor._freeze_parameters() - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_PROCESSOR_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -1445,10 +1442,10 @@ def forward( @add_start_docstrings( """ - Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like - SUPERB Keyword Spotting. + Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks + like SUPERB Keyword Spotting. """, - DATA_2_VEC_AUDIO_START_DOCSTRING, + DATA2VEC_AUDIO_START_DOCSTRING, ) class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel): def __init__(self, config): @@ -1479,7 +1476,7 @@ def freeze_base_model(self): for param in self.data2vec_audio.parameters(): param.requires_grad = False - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_SEQ_CLASS_CHECKPOINT, @@ -1555,7 +1552,7 @@ def forward( """ Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization. """, - DATA_2_VEC_AUDIO_START_DOCSTRING, + DATA2VEC_AUDIO_START_DOCSTRING, ) class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel): def __init__(self, config): @@ -1584,7 +1581,7 @@ def freeze_base_model(self): for param in self.data2vec_audio.parameters(): param.requires_grad = False - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_FRAME_CLASS_CHECKPOINT, @@ -1694,7 +1691,7 @@ def forward(self, hidden_states): """ Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification. 
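`Data2VecAudioForCTC` above pairs the audio encoder with a CTC head. A hedged sketch of inference with it: the checkpoint name comes from the `DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST` in this patch, while the dummy LibriSpeech dataset id is an assumption used only to obtain a 16 kHz waveform.

```python
# Hedged sketch of greedy CTC decoding with the class renamed above. The checkpoint name
# is taken from the archive list in this patch; the dataset id is an assumption.
import torch
from datasets import load_dataset
from transformers import Data2VecAudioForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-base-960h")
model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h")
model.eval()

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
inputs = processor(ds[0]["audio"]["array"], sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids))
```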
""", - DATA_2_VEC_AUDIO_START_DOCSTRING, + DATA2VEC_AUDIO_START_DOCSTRING, ) class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel): def __init__(self, config): @@ -1746,7 +1743,7 @@ def _conv_out_length(input_length, kernel_size, stride): return input_lengths - @add_start_docstrings_to_model_forward(DATA_2_VEC_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_XVECTOR_CHECKPOINT, diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 75ba37c2a4152..6bc514bcc0dbe 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -586,7 +586,6 @@ def forward(self, hidden_states): return pooled_output -# Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->Data2VecText, roberta->data2vec-text class Data2VecTextPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -1259,8 +1258,8 @@ def forward( @add_start_docstrings( """ - Data2VecText Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. + Data2VecText Model with a multiple choice classification head on top (a linear layer on top of the pooled output + and a softmax) e.g. for RocStories/SWAG tasks. """, DATA2VECTEXT_START_DOCSTRING, ) @@ -1354,8 +1353,8 @@ def forward( @add_start_docstrings( """ - Data2VecText Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. + Data2VecText Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. """, DATA2VECTEXT_START_DOCSTRING, ) @@ -1462,8 +1461,8 @@ def forward(self, features, **kwargs): @add_start_docstrings( """ - Data2VecText Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + Data2VecText Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", DATA2VECTEXT_START_DOCSTRING, ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 97920e04e7f67..1a96ceeb2ef3d 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1214,63 +1214,105 @@ def __init__(self, *args, **kwargs): DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = None -class Data2VecForAudioModel(metaclass=DummyObject): +class Data2VecAudioForAudioFrameClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForCausalLM(metaclass=DummyObject): +class Data2VecAudioForCTC(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForMaskedLM(metaclass=DummyObject): +class Data2VecAudioForPreTraining(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForMultipleChoice(metaclass=DummyObject): +class Data2VecAudioForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForQuestionAnswering(metaclass=DummyObject): +class Data2VecAudioForXVector(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForSequenceClassification(metaclass=DummyObject): +class Data2VecAudioModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForTextModel(metaclass=DummyObject): +class Data2VecAudioPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecForTokenClassification(metaclass=DummyObject): +class Data2VecTextForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecPreTrainedModel(metaclass=DummyObject): +class Data2VecTextForMaskedLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextForMultipleChoice(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Data2VecTextPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 3426ed9b86507..03d97f222fd8d 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -12,522 +12,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - - -import unittest - -from transformers import Data2VecConfig, is_torch_available -from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device - -from .test_configuration_common import ConfigTester -from .test_generation_utils import GenerationTesterMixin -from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - - -if is_torch_available(): - import torch - - from transformers import ( - Data2VecForCausalLM, - Data2VecForMaskedLM, - Data2VecForMultipleChoice, - Data2VecForQuestionAnswering, - Data2VecForSequenceClassification, - Data2VecForTextModel, - Data2VecForTokenClassification, - ) - from transformers.models.data2vec.modeling_data2vec import ( - DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecForTextEmbeddings, - create_position_ids_from_input_ids, - ) - - -class Data2VecModelTester: - def __init__( - self, - parent, - ): - self.parent = parent - self.batch_size = 13 - self.seq_length = 7 - self.is_training = True - self.use_input_mask = True - self.use_token_type_ids = True - self.use_labels = True - self.vocab_size = 99 - self.hidden_size = 32 - self.num_hidden_layers = 5 - self.num_attention_heads = 4 - self.intermediate_size = 37 - self.hidden_act = "gelu" - self.hidden_dropout_prob = 0.1 - self.attention_probs_dropout_prob = 0.1 - self.max_position_embeddings = 512 - self.type_vocab_size = 16 - self.type_sequence_label_size = 2 - self.initializer_range = 0.02 - self.num_labels = 3 - self.num_choices = 4 - self.scope = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return Data2VecConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - 
encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecForTextModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Data2VecForTextModel(config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Data2VecForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Data2VecForCausalLM(config=config).to(torch_device).eval() - - # make sure that ids don't start with pad token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - 
output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = Data2VecForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = Data2VecForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - 
- -@require_torch -class Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - - all_model_classes = ( - ( - Data2VecForCausalLM, - Data2VecForMaskedLM, - Data2VecForTextModel, - Data2VecForSequenceClassification, - Data2VecForTokenClassification, - Data2VecForMultipleChoice, - Data2VecForQuestionAnswering, - ) - if is_torch_available() - else () - ) - all_generative_model_classes = (Data2VecForCausalLM,) if is_torch_available() else () - - def setUp(self): - self.model_tester = Data2VecModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = Data2VecForTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_create_position_ids_respects_padding_index(self): - """Ensure that the default position ids only assign a sequential . 
This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = Data2VecForTextEmbeddings(config=config) - - input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = torch.as_tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = Data2VecForTextEmbeddings(config=config) - - inputs_embeds = torch.empty(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - -@require_torch -class Data2VecModelIntegrationTest(TestCasePlus): - @slow - def test_inference_masked_lm(self): - model = Data2VecForMaskedLM.from_pretrained("data2vec") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 11, 50265)) - self.assertEqual(output.shape, expected_shape) - # compare the actual values for a slice. - expected_slice = torch.tensor( - [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] - ) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_no_head(self): - model = Data2VecForTextModel.from_pretrained("data2vec") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - # compare the actual values for a slice. 
- expected_slice = torch.tensor( - [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] - ) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach() - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_classification_head(self): - model = Data2VecForSequenceClassification.from_pretrained("data2vec-large-mnli") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 3)) - self.assertEqual(output.shape, expected_shape) - expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli') - # data2vec.eval() - # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() - - self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) diff --git a/tests/test_modeling_data2vec_audio.py b/tests/test_modeling_data2vec_audio.py new file mode 100644 index 0000000000000..03d97f222fd8d --- /dev/null +++ b/tests/test_modeling_data2vec_audio.py @@ -0,0 +1,14 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/test_modeling_data2vec_text.py b/tests/test_modeling_data2vec_text.py new file mode 100644 index 0000000000000..ffdc88fbd8f74 --- /dev/null +++ b/tests/test_modeling_data2vec_text.py @@ -0,0 +1,534 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Data2VecText model. 
""" + + +import unittest + +from transformers import Data2VecTextConfig, is_torch_available +from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextModel, + ) + from transformers.models.data2vec.modeling_data2vec import ( + DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecTextForTextEmbeddings, + create_position_ids_from_input_ids, + ) + + +class Data2VecModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return Data2VecTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def 
create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = Data2VecTextModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = Data2VecTextForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = Data2VecTextForCausalLM(config=config).to(torch_device).eval() + + # make sure that ids don't start with pad token + mask = input_ids.ne(config.pad_token_id).long() + input_ids = input_ids * mask + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + + # make sure that ids don't start with pad token + mask = next_tokens.ne(config.pad_token_id).long() + next_tokens = next_tokens * mask + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + 
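+        # The cached branch below feeds only the new tokens together with `past_key_values`;
+        # its hidden states for those positions should match the full-sequence pass above.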
output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = Data2VecTextForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = Data2VecTextForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class 
Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextModel, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = Data2VecModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = Data2VecTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_create_position_ids_respects_padding_index(self): + """Ensure that the default position ids only assign a sequential . 
This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = Data2VecTextForTextEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) + + position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """Ensure that the default position ids only assign a sequential . This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = Data2VecTextForTextEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + +@require_torch +class Data2VecModelIntegrationTest(TestCasePlus): + @slow + def test_inference_masked_lm(self): + model = Data2VecTextForMaskedLM.from_pretrained("data2vec") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] + ) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') + # data2vec.eval() + # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_no_head(self): + model = Data2VecTextModel.from_pretrained("data2vec") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + # compare the actual values for a slice. 
+        expected_slice = torch.tensor(
+            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]]
+        )
+
+        # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base')
+        # data2vec.eval()
+        # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach()
+
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
+
+    @slow
+    def test_inference_classification_head(self):
+        model = Data2VecTextForSequenceClassification.from_pretrained("data2vec-large-mnli")
+
+        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+        with torch.no_grad():
+            output = model(input_ids)[0]
+        expected_shape = torch.Size((1, 3))
+        self.assertEqual(output.shape, expected_shape)
+        expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]])
+
+        # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli')
+        # data2vec.eval()
+        # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach()
+
+        self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4))
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 798273fe7c572..3391c6bfbbd1c 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -88,7 +88,7 @@
     "TFRobertaForMultipleChoice",  # TODO: fix
     "TrOCRDecoderWrapper",  # Building part of bigger (tested) model.
     "SeparableConv1D",  # Building part of bigger (tested) model.
-    "Data2VecForAudioModel",  # TODO: Add tests
+    "Data2VecAudioModel",  # TODO: Add tests
 ]
 
 # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't
@@ -169,7 +169,8 @@
     "VisualBertForMultipleChoice",
     "TFWav2Vec2ForCTC",
     "TFHubertForCTC",
-    "Data2VecForAudioModel",  # TODO: Fix
+    "Data2VecAudioModel",  # TODO: Fix
+    "Data2VecAudioForPreTraining",  # TODO: Fix
 ]
 
 # This is to make sure the transformers module imported is the one in the repo.
diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py
index 78fb572e16e6f..6c912141d36f0 100644
--- a/utils/tests_fetcher.py
+++ b/utils/tests_fetcher.py
@@ -373,6 +373,7 @@ def module_to_test_file(module_fname):
         "tests/sagemaker/test_single_node_gpu.py",  # SageMaker test
         "tests/sagemaker/test_multi_node_model_parallel.py",  # SageMaker test
         "tests/sagemaker/test_multi_node_data_parallel.py",  # SageMaker test
+        "tests/test_modeling_data2vec.py",  # TODO: Remove redundant file
     ]

From bbd3846c19809f5bf04a73717e09d9169476f1d7 Mon Sep 17 00:00:00 2001
From: Eduardo Gonzalez Ponferrada
Date: Tue, 22 Feb 2022 02:27:41 -0500
Subject: [PATCH 094/121] Remove Data2VecAudioForPreTraining. Add tests for
 Data2VecAudio, mimicking the Wav2Vec2 test suite. Fix bias initialization in
 positional conv layers. Move back configurations for audio and text to
 separate files.
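A minimal usage sketch of the resulting layout, assuming only the classes exported by this
patch (`Data2VecAudioConfig`, `Data2VecTextConfig`, `Data2VecAudioModel`, `Data2VecTextModel`);
the models are randomly initialized from default configuration values, not loaded from a
released checkpoint:

```python
# Sketch only: exercises the audio/text configuration split introduced in this patch.
# All class names come from the diff; no pretrained checkpoint is assumed.
from transformers import (
    Data2VecAudioConfig,
    Data2VecAudioModel,
    Data2VecTextConfig,
    Data2VecTextModel,
)

# Each modality now has its own configuration module:
# configuration_data2vec_audio.py and configuration_data2vec_text.py.
audio_config = Data2VecAudioConfig()  # audio-specific fields, e.g. conv_dim, mask_time_prob
text_config = Data2VecTextConfig()  # text-specific fields, e.g. position_embedding_type

# Randomly initialized models built from those configurations.
audio_model = Data2VecAudioModel(audio_config)
text_model = Data2VecTextModel(text_config)
```

Keeping the two configurations in separate files keeps the Wav2Vec2-style audio arguments
(conv_*, mask_*_prob) apart from the text arguments, so each side can evolve independently.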
--- docs/source/model_doc/data2vec.mdx | 5 - src/transformers/__init__.py | 6 +- src/transformers/models/data2vec/__init__.py | 15 +- .../models/data2vec/configuration_data2vec.py | 437 ------ .../data2vec/configuration_data2vec_audio.py | 320 +++++ .../data2vec/configuration_data2vec_text.py | 147 ++ ..._original_pytorch_checkpoint_to_pytorch.py | 2 +- .../data2vec/modeling_data2vec_audio.py | 254 +--- .../models/data2vec/modeling_data2vec_text.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 7 - tests/test_modeling_data2vec_audio.py | 1273 +++++++++++++++++ utils/check_repo.py | 2 - 12 files changed, 1753 insertions(+), 717 deletions(-) create mode 100644 src/transformers/models/data2vec/configuration_data2vec_audio.py create mode 100644 src/transformers/models/data2vec/configuration_data2vec_text.py diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index 6474ceb4758ca..6c682e03858f7 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -64,11 +64,6 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma [[autodoc]] Data2VecAudioForCTC - forward -## Data2VecAudioForPreTraining - -[[autodoc]] Data2VecAudioForPreTraining - - forward - ## Data2VecAudioForSequenceClassification [[autodoc]] Data2VecAudioForSequenceClassification diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3d1ab30f10251..433769ca2f640 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -204,7 +204,7 @@ "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.cpm": ["CpmTokenizer"], "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"], - "models.data2vec": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecTextConfig"], + "models.data2vec": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig", "Data2VecTextConfig"], "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"], "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], @@ -877,7 +877,6 @@ "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", "Data2VecAudioForAudioFrameClassification", "Data2VecAudioForCTC", - "Data2VecAudioForPreTraining", "Data2VecAudioForSequenceClassification", "Data2VecAudioForXVector", "Data2VecAudioModel", @@ -2481,7 +2480,7 @@ from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.cpm import CpmTokenizer from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer - from .models.data2vec import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecTextConfig + from .models.data2vec import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig, Data2VecTextConfig from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig @@ -3039,7 +3038,6 @@ DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, Data2VecAudioForAudioFrameClassification, Data2VecAudioForCTC, - Data2VecAudioForPreTraining, Data2VecAudioForSequenceClassification, Data2VecAudioForXVector, Data2VecAudioModel, diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py index 8abc7d3def191..d67c90a2d1314 
100644 --- a/src/transformers/models/data2vec/__init__.py +++ b/src/transformers/models/data2vec/__init__.py @@ -22,11 +22,12 @@ _import_structure = { - "configuration_data2vec": [ + "configuration_data2vec_audio": [ "DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP", - "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig", - "Data2VecAudioOnnxConfig", + ], + "configuration_data2vec_text": [ + "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecTextConfig", "Data2VecTextOnnxConfig", ], @@ -37,7 +38,6 @@ "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST", "Data2VecAudioForAudioFrameClassification", "Data2VecAudioForCTC", - "Data2VecAudioForPreTraining", "Data2VecAudioForSequenceClassification", "Data2VecAudioForXVector", "Data2VecAudioModel", @@ -56,11 +56,9 @@ ] if TYPE_CHECKING: - from .configuration_data2vec import ( - DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, + from .configuration_data2vec_audio import DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig + from .configuration_data2vec_text import ( DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, - Data2VecAudioConfig, - Data2VecAudioOnnxConfig, Data2VecTextConfig, Data2VecTextOnnxConfig, ) @@ -70,7 +68,6 @@ DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST, Data2VecAudioForAudioFrameClassification, Data2VecAudioForCTC, - Data2VecAudioForPreTraining, Data2VecAudioForSequenceClassification, Data2VecAudioForXVector, Data2VecAudioModel, diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py index 4958f60135aff..b0b4464ad1170 100644 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ b/src/transformers/models/data2vec/configuration_data2vec.py @@ -12,440 +12,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Data2VecText configuration""" -from collections import OrderedDict -from typing import Mapping - -from ...configuration_utils import PretrainedConfig -from ...onnx import OnnxConfig -from ...utils import logging -import math - - -logger = logging.get_logger(__name__) - -DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json", -} - -DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/data2vec-base-960h": "https://huggingface.co/facebook/data2vec-audio-base-960h/resolve/main/config.json", - # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio -} - - -class Data2VecTextConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Data2VecTextModel`] and [`Data2VecTextModel`]. It - is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecText - [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 30522): - Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. 
- hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"silu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): - The dropout ratio for the attention probabilities. - max_position_embeddings (`int`, *optional*, defaults to 512): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. 
- - Examples: - - ```python - >>> from transformers import Data2VecTextModel, Data2VecTextConfig - - >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration - >>> configuration = Data2VecTextConfig() - - >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration - >>> model = Data2VecTextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "data2vec_text" - - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - position_embedding_type="absolute", - use_cache=True, - classifier_dropout=None, - **kwargs - ): - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout - - -class Data2VecTextOnnxConfig(OnnxConfig): - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("input_ids", {0: "batch", 1: "sequence"}), - ("attention_mask", {0: "batch", 1: "sequence"}), - ] - ) - - -class Data2VecAudioConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate - an Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the Data2VecAudio - [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32): - Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented - by the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`]. Vocabulary size - of the model. Defines the different tokens that can be represented by the *inputs_ids* passed to the - forward method of [`Data2VecAudioModel`]. - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. 
- intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (`float`, *optional*, defaults to 0.1): - The dropout ratio for the attention probabilities. - final_dropout (`float`, *optional*, defaults to 0.1): - The dropout probability for the final projection layer of [`Data2VecAudioForCTC`]. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group - normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D - convolutional layers. - feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature encoder. - feat_extract_activation (`str, `optional`, defaults to `"gelu"`): - The non-linear activation function (function or string) in the 1D convolutional layers of the feature - extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for quantized feature encoder states. - conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): - A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the - feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. - conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): - A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length - of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. - conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The - length of *conv_kernel* defines the number of convolutional layers and has to match the length of - *conv_dim*. - conv_bias (`bool`, *optional*, defaults to `False`): - Whether the 1D convolutional layers have a bias. - num_conv_pos_embeddings (`int`, *optional*, defaults to 128): - Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional - embeddings layer. - num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): - Number of groups of 1D convolutional positional embeddings layer. - mask_time_prob (`float`, *optional*, defaults to 0.05): - Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking - procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. 
If - reasoning from the propability of each feature vector to be chosen as the start of the vector span to be - masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the - mask_time_length (`int`, *optional*, defaults to 10): - Length of vector span along the time axis. - mask_time_min_masks (`int`, *optional*, defaults to 2),: - The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, - irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < - mask_time_min_masks'' - mask_feature_prob (`float`, *optional*, defaults to 0.0): - Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The - masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over - the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector - span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap - may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is - True`. - mask_feature_length (`int`, *optional*, defaults to 10): - Length of vector span along the feature axis. - mask_feature_min_masks (`int`, *optional*, defaults to 0),: - The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time - step, irrespectively of `mask_feature_prob`. Only relevant if - ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' - num_codevectors_per_group (`int`, *optional*, defaults to 320): - Number of entries in each quantization codebook (group). - num_codevector_groups (`int`, *optional*, defaults to 2): - Number of codevector groups for product codevector quantization. - contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): - The temperature *kappa* in the contrastive loss. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. - num_negatives (`int`, *optional*, defaults to 100): - Number of negative samples for the contrastive loss. - codevector_dim (`int`, *optional*, defaults to 256): - Dimensionality of the quantized feature vectors. - proj_codevector_dim (`int`, *optional*, defaults to 256): - Dimensionality of the final projection of both the quantized and the transformer features. - diversity_loss_weight (`int`, *optional*, defaults to 0.1): - The weight of the codebook diversity loss component. - ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): - Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an - instance of [`Data2VecAudioForCTC`]. - ctc_zero_infinity (`bool`, *optional*, defaults to `False`): - Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly - occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance - of [`Data2VecAudioForCTC`]. - use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): - Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an - instance of [`Data2VecAudioForSequenceClassification`]. 
- classifier_proj_size (`int`, *optional*, defaults to 256): - Dimensionality of the projection before token mean-pooling for classification. - tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): - A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* - module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. - tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the - *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. - tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): - A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the - *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. - xvector_output_dim (`int`, *optional*, defaults to 512): - Dimensionality of the *XVector* embedding vectors. - add_adapter (`bool`, *optional*, defaults to `False`): - Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful - for warm-starting Data2VecAudio for SpeechEncoderDecoder models. - adapter_kernel_size (`int`, *optional*, defaults to 3): - Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. - adapter_stride (`int`, *optional*, defaults to 2): - Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. - num_adapter_layers (`int`, *optional*, defaults to 3): - Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is - True`. - output_hidden_size (`int`, *optional*): - Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant - if `add_adapter is True`. 
- - Example: - - ```python - >>> from transformers import Data2VecAudioModel, Data2VecAudioConfig - - >>> # Initializing a Data2VecAudio facebook/wav2vec2-base-960h style configuration - >>> configuration = Data2VecAudioConfig() - - >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration - >>> model = Data2VecAudioModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "data2vec-audio" - - def __init__( - self, - vocab_size=32, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_dropout=0.0, - feat_quantizer_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - feat_extract_norm="group", - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - num_conv_pos_embedding_groups=16, - conv_pos_kernel_size=19, - num_conv_pos_embeddings=5, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - num_codevectors_per_group=320, - num_codevector_groups=2, - contrastive_logits_temperature=0.1, - num_negatives=100, - codevector_dim=256, - proj_codevector_dim=256, - diversity_loss_weight=0.1, - ctc_loss_reduction="sum", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=256, - tdnn_dim=(512, 512, 512, 512, 1500), - tdnn_kernel=(5, 3, 3, 1, 1), - tdnn_dilation=(1, 2, 3, 1, 1), - xvector_output_dim=512, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - add_adapter=False, - adapter_kernel_size=3, - adapter_stride=2, - num_adapter_layers=3, - output_hidden_size=None, - **kwargs - ): - super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.conv_pos_kernel_size = conv_pos_kernel_size - self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.use_weighted_layer_sum = use_weighted_layer_sum - - if ( - (len(self.conv_stride) != self.num_feat_extract_layers) - or (len(self.conv_kernel) != self.num_feat_extract_layers) - or (len(self.conv_dim) != self.num_feat_extract_layers) - ): - raise ValueError( - "Configuration for convolutional layers is incorrect. 
" - "It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, " - f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) " - f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`." - ) - - # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # parameters for pretraining with codevector quantized representations - self.num_codevectors_per_group = num_codevectors_per_group - self.num_codevector_groups = num_codevector_groups - self.contrastive_logits_temperature = contrastive_logits_temperature - self.feat_quantizer_dropout = feat_quantizer_dropout - self.num_negatives = num_negatives - self.codevector_dim = codevector_dim - self.proj_codevector_dim = proj_codevector_dim - self.diversity_loss_weight = diversity_loss_weight - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # adapter - self.add_adapter = add_adapter - self.adapter_kernel_size = adapter_kernel_size - self.adapter_stride = adapter_stride - self.num_adapter_layers = num_adapter_layers - self.output_hidden_size = output_hidden_size or hidden_size - - # SequenceClassification-specific parameter. Feel free to ignore for other classes. - self.classifier_proj_size = classifier_proj_size - - # XVector-specific parameters. Feel free to ignore for other classes. - self.tdnn_dim = list(tdnn_dim) - self.tdnn_kernel = list(tdnn_kernel) - self.tdnn_dilation = list(tdnn_dilation) - self.xvector_output_dim = xvector_output_dim - - @property - def inputs_to_logits_ratio(self): - return math.prod(self.conv_stride) diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py new file mode 100644 index 0000000000000..c6e5780e86294 --- /dev/null +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -0,0 +1,320 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Data2VecText configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/data2vec-base-960h": "https://huggingface.co/facebook/data2vec-audio-base-960h/resolve/main/config.json", + # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio +} + + +class Data2VecAudioConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. 
It is used to instantiate + a Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Data2VecAudio + [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32): + Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + final_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the final projection layer of [`Data2VecAudioForCTC`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + feat_extract_norm (`str`, *optional*, defaults to `"group"`): + The norm to be applied to 1D convolutional layers in the feature encoder. One of `"group"` for group + normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D + convolutional layers. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for the output of the feature encoder. + feat_extract_activation (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the 1D convolutional layers of the feature + extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for quantized feature encoder states. + conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): + A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. 
+ conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length + of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. + conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The + length of *conv_kernel* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_bias (`bool`, *optional*, defaults to `False`): + Whether the 1D convolutional layers have a bias. + num_conv_pos_embeddings (`int`, *optional*, defaults to 5): + Number of 1D convolutional positional embedding layers (each with kernel size *conv_pos_kernel_size*). + num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): + Number of groups of the 1D convolutional positional embedding layers. + mask_time_prob (`float`, *optional*, defaults to 0.05): + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + actual percentage of masked vectors. + mask_time_length (`int`, *optional*, defaults to 10): + Length of vector span along the time axis. + mask_time_min_masks (`int`, *optional*, defaults to 2): + The minimum number of masks of length `mask_time_length` generated along the time axis, each time step, + irrespectively of `mask_time_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks'' + mask_feature_prob (`float`, *optional*, defaults to 0.0): + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over + the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, *optional*, defaults to 10): + Length of vector span along the feature axis. + mask_feature_min_masks (`int`, *optional*, defaults to 0): + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' + num_codevectors_per_group (`int`, *optional*, defaults to 320): + Number of entries in each quantization codebook (group). + num_codevector_groups (`int`, *optional*, defaults to 2): + Number of codevector groups for product codevector quantization. + contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): + The temperature *kappa* in the contrastive loss. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for the output of the feature encoder that's used by the quantizer. 
+ num_negatives (`int`, *optional*, defaults to 100): + Number of negative samples for the contrastive loss. + codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the quantized feature vectors. + proj_codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the final projection of both the quantized and the transformer features. + diversity_loss_weight (`int`, *optional*, defaults to 0.1): + The weight of the codebook diversity loss component. + ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`Data2VecAudioForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`Data2VecAudioForCTC`]. + use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): + Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an + instance of [`Data2VecAudioForSequenceClassification`]. + classifier_proj_size (`int`, *optional*, defaults to 256): + Dimensionality of the projection before token mean-pooling for classification. + tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): + A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* + module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. + tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the + *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. + tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): + A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the + *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. + xvector_output_dim (`int`, *optional*, defaults to 512): + Dimensionality of the *XVector* embedding vectors. + add_adapter (`bool`, *optional*, defaults to `False`): + Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful + for warm-starting Data2VecAudio for SpeechEncoderDecoder models. + adapter_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adapter_stride (`int`, *optional*, defaults to 2): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + num_adapter_layers (`int`, *optional*, defaults to 3): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + output_hidden_size (`int`, *optional*): + Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant + if `add_adapter is True`. 
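For orientation, the following is a minimal, illustrative sketch (not part of the patch) of how the convolution and masking parameters documented above interact. It assumes the usual per-layer length formula `(length - kernel) // stride + 1`; the library helpers may round slightly differently, and the overall downsampling factor equals `math.prod(conv_stride)` (320 with these defaults).

```python
import math

# Defaults mirrored from the Data2VecAudioConfig signature in this file.
conv_kernel = (10, 3, 3, 3, 3, 2, 2)
conv_stride = (5, 2, 2, 2, 2, 2, 2)


def feat_extract_output_length(input_length: int) -> int:
    # Each 1D conv layer shrinks the sequence roughly as floor((L - kernel) / stride) + 1.
    for kernel, stride in zip(conv_kernel, conv_stride):
        input_length = (input_length - kernel) // stride + 1
    return input_length


# One second of 16 kHz audio maps to ~49 frames, i.e. about one frame every 20 ms;
# the overall downsampling factor is the product of the strides.
print(feat_extract_output_length(16_000))  # 49
print(math.prod(conv_stride))  # 320

# Expected number of masked time spans, following the mask_time_prob description:
# roughly mask_time_prob * num_frames / mask_time_length, floored at mask_time_min_masks.
mask_time_prob, mask_time_length, mask_time_min_masks = 0.05, 10, 2
num_frames = feat_extract_output_length(16_000)
num_masks = max(int(mask_time_prob * num_frames / mask_time_length), mask_time_min_masks)
print(num_masks)  # 2
```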
+ + Example: + + ```python + >>> from transformers import Data2VecAudioModel, Data2VecAudioConfig + + >>> # Initializing a Data2VecAudio facebook/wav2vec2-base-960h style configuration + >>> configuration = Data2VecAudioConfig() + + >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration + >>> model = Data2VecAudioModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "data2vec-audio" + + def __init__( + self, + vocab_size=32, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout=0.1, + activation_dropout=0.1, + attention_dropout=0.1, + feat_proj_dropout=0.0, + feat_quantizer_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + feat_extract_norm="group", + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + num_conv_pos_embedding_groups=16, + conv_pos_kernel_size=19, + num_conv_pos_embeddings=5, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + num_codevectors_per_group=320, + num_codevector_groups=2, + contrastive_logits_temperature=0.1, + num_negatives=100, + codevector_dim=256, + proj_codevector_dim=256, + diversity_loss_weight=0.1, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + classifier_proj_size=256, + tdnn_dim=(512, 512, 512, 512, 1500), + tdnn_kernel=(5, 3, 3, 1, 1), + tdnn_dilation=(1, 2, 3, 1, 1), + xvector_output_dim=512, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=3, + output_hidden_size=None, + **kwargs + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.conv_pos_kernel_size = conv_pos_kernel_size + self.num_feat_extract_layers = len(self.conv_dim) + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.use_weighted_layer_sum = use_weighted_layer_sum + + if ( + (len(self.conv_stride) != self.num_feat_extract_layers) + or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. 
" + "It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, " + f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) " + f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # parameters for pretraining with codevector quantized representations + self.num_codevectors_per_group = num_codevectors_per_group + self.num_codevector_groups = num_codevector_groups + self.contrastive_logits_temperature = contrastive_logits_temperature + self.feat_quantizer_dropout = feat_quantizer_dropout + self.num_negatives = num_negatives + self.codevector_dim = codevector_dim + self.proj_codevector_dim = proj_codevector_dim + self.diversity_loss_weight = diversity_loss_weight + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.output_hidden_size = output_hidden_size or hidden_size + + # SequenceClassification-specific parameter. Feel free to ignore for other classes. + self.classifier_proj_size = classifier_proj_size + + # XVector-specific parameters. Feel free to ignore for other classes. + self.tdnn_dim = list(tdnn_dim) + self.tdnn_kernel = list(tdnn_kernel) + self.tdnn_dilation = list(tdnn_dilation) + self.xvector_output_dim = xvector_output_dim diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py new file mode 100644 index 0000000000000..2389afd215840 --- /dev/null +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Data2VecText configuration""" +from collections import OrderedDict +from typing import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json", +} + + +class Data2VecTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Data2VecTextModel`] and [`Data2VecTextModel`]. It + is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture. 
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecText + [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Data2VecText model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Data2VecTextModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`Data2VecTextModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
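As an illustration only (not code from this patch), the sketch below shows how such a configuration is typically consumed: the optional `classifier_dropout` conventionally falls back to `hidden_dropout_prob` in RoBERTa-style classification heads, and `use_cache` only matters once `is_decoder=True` is set. The fallback logic is an assumption about the head implementation, not a quote from it.

```python
from transformers import Data2VecTextConfig

config = Data2VecTextConfig(
    hidden_dropout_prob=0.1,
    classifier_dropout=None,  # no dedicated value set
)

# Assumed RoBERTa-style fallback for classification heads:
# use classifier_dropout when provided, otherwise reuse hidden_dropout_prob.
classifier_dropout = (
    config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
print(classifier_dropout)  # 0.1

# `use_cache` only has an effect when the model is configured as a decoder.
decoder_config = Data2VecTextConfig(is_decoder=True, use_cache=True)
print(decoder_config.is_decoder, decoder_config.use_cache)  # True True
```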
+ + Examples: + + ```python + >>> from transformers import Data2VecTextModel, Data2VecTextConfig + + >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration + >>> configuration = Data2VecTextConfig() + + >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration + >>> model = Data2VecTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "data2vec_text" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class Data2VecTextOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py index 4ffdb27c082d8..e8a703de91f36 100644 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -24,7 +24,7 @@ from datasets import load_dataset from transformers import Wav2Vec2Processor, logging -from transformers.models.data2vec.configuration_data2vec import Data2VecAudioConfig +from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig # Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy # noqa: F401 diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 5fd4cd29688c5..aad099fa7dce9 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -31,12 +31,11 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, - replace_return_docstrings, ) from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput from ...modeling_utils import PreTrainedModel, torch_int_div from ...utils 
import logging -from .configuration_data2vec import Data2VecAudioConfig +from .configuration_data2vec_audio import Data2VecAudioConfig logger = logging.get_logger(__name__) @@ -109,48 +108,6 @@ class Data2VecAudioBaseModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -@dataclass -class Data2VecAudioForPreTrainingOutput(ModelOutput): - """ - Output type of [`Data2VecAudioForPreTraining`], with potential hidden states and attentions. - - Args: - loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official - paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. - projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): - Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked - projected quantized states. - projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): - Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive - target vectors for contrastive loss. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): - The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . - diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): - The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . 
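To make the pretraining output documented above (and removed by this hunk) concrete, here is a condensed, illustrative sketch of how its quantities are produced: contrastive logits are cosine similarities between the predicted states and the positive target stacked with sampled negatives, scaled by the temperature *kappa*, and the total loss combines the two terms as `L = L_m + diversity_loss_weight * L_d`. All names, shapes, and values below are toy placeholders, not the module's actual code.

```python
import torch


def contrastive_logits(target, negatives, predicted, temperature=0.1):
    # Cosine similarity between predicted states and [positive; negatives], scaled by kappa.
    candidates = torch.cat([target, negatives], dim=0)
    logits = torch.cosine_similarity(predicted.float(), candidates.float(), dim=-1)
    return logits / temperature


# Toy shapes: 1 positive + 4 negatives for a (batch=2, seq=5, dim=8) batch.
predicted = torch.randn(1, 2, 5, 8)
positive = torch.randn(1, 2, 5, 8)
negatives = torch.randn(4, 2, 5, 8)

logits = contrastive_logits(positive, negatives, predicted)
print(logits.shape)  # torch.Size([5, 2, 5]) -> (num_candidates, batch, seq)

# Total pretraining loss as described in the docstring: L = L_m + alpha * L_d,
# where alpha corresponds to config.diversity_loss_weight.
contrastive_loss = torch.tensor(1.0)  # placeholder for L_m
diversity_loss = torch.tensor(0.2)  # placeholder for L_d
diversity_loss_weight = 0.1
loss = contrastive_loss + diversity_loss_weight * diversity_loss
print(loss)  # tensor(1.0200)
```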
- """ - - loss: Optional[torch.FloatTensor] = None - projected_states: torch.FloatTensor = None - projected_quantized_states: torch.FloatTensor = None - codevector_perplexity: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - contrastive_loss: Optional[torch.FloatTensor] = None - diversity_loss: Optional[torch.FloatTensor] = None - - @dataclass class XVectorOutput(ModelOutput): """ @@ -862,6 +819,8 @@ def _init_weights(self, module): k = math.sqrt(1 / module.projection.in_features) nn.init.uniform_(module.projection.weight, a=-k, b=k) nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, Data2VecAudioPositionalConvLayer): + nn.init.constant_(module.conv.bias, 0) elif isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) @@ -1121,213 +1080,6 @@ def forward( ) -@add_start_docstrings("""Data2VecAudio Model with a quantizer and `VQ` head on top.""", DATA2VEC_AUDIO_START_DOCSTRING) -class Data2VecAudioForPreTraining(Data2VecAudioPreTrainedModel): - def __init__(self, config: Data2VecAudioConfig): - super().__init__(config) - self.data2vec_audio = Data2VecAudioModel(config) - self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) - - # Initialize weights and apply final processing - self.post_init() - - # make sure that project_hid & project_q are initialized like normal linear layers - self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim) - self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim) - - def set_gumbel_temperature(self, temperature: int): - """ - Set the Gumbel softmax temperature to a given value. Only necessary for training - """ - self.quantizer.temperature = temperature - - def freeze_feature_encoder(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - self.data2vec_audio.feature_extractor._freeze_parameters() - - @staticmethod - def compute_contrastive_logits( - target_features: torch.FloatTensor, - negative_features: torch.FloatTensor, - predicted_features: torch.FloatTensor, - temperature: int = 0.1, - ): - """ - Compute logits for contrastive loss based using cosine similarity as the distance measure between - `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied. - """ - target_features = torch.cat([target_features, negative_features], dim=0) - - logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as( - target_features - ) - - # apply temperature - logits = logits / temperature - return logits - - @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Data2VecAudioForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_values, - attention_mask=None, - mask_time_indices=None, - sampled_negative_indices=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict - masked extracted features in *config.proj_codevector_dim* space. 
- sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): - Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. - Required input for pre-training. - - Returns: - - Example: - - ```python - >>> import torch - >>> from transformers import Wav2Vec2FeatureExtractor, Data2VecAudioForPreTraining - >>> from transformers.models.data2vec.modeling_data2vec import _compute_mask_indices - >>> from datasets import load_dataset - >>> import soundfile as sf - - >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("patrickvonplaten/data2vec-audio-base") - >>> model = Data2VecAudioForPreTraining.from_pretrained("patrickvonplaten/data2vec-audio-base") - - - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch - - - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> ds = ds.map(map_to_array) - - >>> input_values = feature_extractor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 - - >>> # compute masked indices - >>> batch_size, raw_sequence_length = input_values.shape - >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length) - >>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2) - >>> mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long) - - >>> with torch.no_grad(): - ... outputs = model(input_values, mask_time_indices=mask_time_indices) - - >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) - >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) - - >>> # show that cosine similarity is much higher than random - >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5 - tensor(True) - - >>> # for contrastive loss training model should be put into train mode - >>> model = model.train() - >>> loss = model(input_values, mask_time_indices=mask_time_indices).loss - ```""" - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if mask_time_indices is not None: - mask_time_indices = mask_time_indices.to(torch.bool) - - outputs = self.data2vec_audio( - input_values, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - mask_time_indices=mask_time_indices, - return_dict=return_dict, - ) - - # 1. project all transformed features (including masked) to final vq dim - transformer_features = self.project_hid(outputs[0]) - - # 2. quantize all (unmasked) extracted features and project to final vq dim - extract_features = self.dropout_features(outputs[1]) - - if attention_mask is not None: - # compute reduced attention_mask correponding to feature vectors - attention_mask = self._get_feature_vector_attention_mask( - extract_features.shape[1], attention_mask, add_adapter=False - ) - - quantized_features, codevector_perplexity = self.quantizer( - extract_features, mask_time_indices=mask_time_indices - ) - quantized_features = self.project_q(quantized_features) - - loss = contrastive_loss = diversity_loss = None - if sampled_negative_indices is not None: - batch_size, sequence_length, hidden_size = quantized_features.shape - - # for training, we sample negatives - # 3. 
sample K negatives (distractors) quantized states for contrastive loss - # if attention_mask is passed, make sure that padded feature vectors cannot be sampled - # sample negative quantized vectors BTC => (BxT)C - negative_quantized_features = quantized_features.view(-1, hidden_size)[ - sampled_negative_indices.long().view(-1) - ] - negative_quantized_features = negative_quantized_features.view( - batch_size, sequence_length, -1, hidden_size - ).permute(2, 0, 1, 3) - - # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa` - # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf - logits = self.compute_contrastive_logits( - quantized_features[None, :], - negative_quantized_features, - transformer_features, - self.config.contrastive_logits_temperature, - ) - - # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low), - # its cosine similarity will be masked - neg_is_pos = (quantized_features == negative_quantized_features).all(-1) - - if neg_is_pos.any(): - logits[1:][neg_is_pos] = float("-inf") - - # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = - # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) - logits = logits.transpose(0, 2).reshape(-1, logits.size(0)) - target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten() - - contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") - # 7. compute diversity loss: \mathbf{L}_d - num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups - diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() - - # 8. \mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d - loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss - - if not return_dict: - if loss is not None: - return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:] - return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] - - return Data2VecAudioForPreTrainingOutput( - loss=loss, - projected_states=transformer_features, - projected_quantized_states=quantized_features, - codevector_perplexity=codevector_perplexity, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - contrastive_loss=contrastive_loss, - diversity_loss=diversity_loss, - ) - - @add_start_docstrings( """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", DATA2VEC_AUDIO_START_DOCSTRING, diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 6bc514bcc0dbe..4db194150bdb6 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -46,7 +46,7 @@ prune_linear_layer, ) from ...utils import logging -from .configuration_data2vec import Data2VecTextConfig +from .configuration_data2vec_text import Data2VecTextConfig logger = logging.get_logger(__name__) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1a96ceeb2ef3d..3bce039f60cfb 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1228,13 +1228,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Data2VecAudioForPreTraining(metaclass=DummyObject): - _backends = ["torch"] - - def 
__init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class Data2VecAudioForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/test_modeling_data2vec_audio.py b/tests/test_modeling_data2vec_audio.py index 03d97f222fd8d..8a39f3d11e789 100644 --- a/tests/test_modeling_data2vec_audio.py +++ b/tests/test_modeling_data2vec_audio.py @@ -12,3 +12,1276 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" Testing suite for the PyTorch Data2VecAudio model. """ + +import math +import unittest + +import numpy as np +from datasets import load_dataset + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import Data2VecAudioConfig, is_torch_available +from transformers.testing_utils import is_pt_flax_cross_test, require_soundfile, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, _config_zero_init + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Wav2Vec2FeatureExtractor, + Wav2Vec2Processor, + ) + from transformers.models.wav2vec2.modeling_wav2vec2 import ( + Wav2Vec2GumbelVectorQuantizer, + _compute_mask_indices, + _sample_negative_indices, + ) + + +class Data2VecAudioModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, # speech is longer + is_training=False, + hidden_size=16, + feat_extract_norm="group", + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=4, + num_attention_heads=2, + hidden_dropout_prob=0.1, # this is most likely not correctly set yet + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + mask_time_prob=0.5, + mask_time_length=2, + vocab_size=32, + do_stable_layer_norm=False, + num_adapter_layers=1, + adapter_stride=2, + tdnn_dim=(32, 32), + tdnn_kernel=(5, 3), + tdnn_dilation=(1, 2), + xvector_output_dim=32, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.num_adapter_layers = num_adapter_layers + self.adapter_stride = adapter_stride + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + 
self.scope = scope + self.tdnn_dim = tdnn_dim + self.tdnn_kernel = tdnn_kernel + self.tdnn_dilation = tdnn_dilation + self.xvector_output_dim = xvector_output_dim + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = self.get_config() + + return config, input_values, attention_mask + + def get_config(self): + return Data2VecAudioConfig( + hidden_size=self.hidden_size, + feat_extract_norm=self.feat_extract_norm, + feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + mask_time_prob=self.mask_time_prob, + mask_time_length=self.mask_time_length, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + layer_norm_eps=self.layer_norm_eps, + do_stable_layer_norm=self.do_stable_layer_norm, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + vocab_size=self.vocab_size, + num_adapter_layers=self.num_adapter_layers, + adapter_stride=self.adapter_stride, + tdnn_dim=self.tdnn_dim, + tdnn_kernel=self.tdnn_kernel, + tdnn_dilation=self.tdnn_dilation, + xvector_output_dim=self.xvector_output_dim, + ) + + def create_and_check_model(self, config, input_values, attention_mask): + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter(self, config, input_values, attention_mask): + config.add_adapter = True + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): + config.add_adapter = True + config.output_hidden_size = 8 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), + ) + + def create_and_check_batch_inference(self, config, input_values, *args): + # test does not pass for models making use of `group_norm` + # check: https://github.com/pytorch/fairseq/issues/3227 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, 
device=torch_device, dtype=torch.bool) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0.0 + + batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state + + for i in range(input_values.shape[0]): + input_slice = input_values[i : i + 1, : input_lengths[i]] + output = model(input_slice).last_hidden_state + + batch_output = batch_outputs[i : i + 1, : output.shape[1]] + self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) + + def check_ctc_loss(self, config, input_values, *args): + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + model.config.ctc_loss_reduction = "sum" + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + model.config.ctc_loss_reduction = "mean" + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(sum_loss, float)) + self.parent.assertTrue(isinstance(mean_loss, float)) + + def check_seq_classifier_loss(self, config, input_values, *args): + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + unmasked_loss = model(input_values, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(masked_loss, float)) + self.parent.assertTrue(isinstance(unmasked_loss, float)) + self.parent.assertTrue(masked_loss != unmasked_loss) + + def check_ctc_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + model.train() + + # freeze feature encoder + model.freeze_feature_encoder() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + if max_length_labels[i] < labels.shape[-1]: + # it's important that we make sure that target lenghts are at least + # one shorter than logit lenghts to prevent -inf + labels[i, max_length_labels[i] - 1 :] = -100 + + loss = 
model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_seq_classifier_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_xvector_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForXVector(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_labels_out_of_vocab(self, config, input_values, *args): + model = Data2VecAudioForCTC(config) + model.to(torch_device) + model.train() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) + + with self.parent.assertRaises(ValueError): + model(input_values, labels=labels) + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (Data2VecAudioForCTC, Data2VecAudioModel, Data2VecAudioForSequenceClassification) + if is_torch_available() + else () + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Data2VecAudioModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_adapter(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) + + def test_model_with_adapter_proj_dim(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + 
config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_xvector_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_xvector_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + # Data2VecAudio has no inputs_embeds + def test_inputs_embeds(self): + pass + + # `input_ids` is renamed to `input_values` + def test_forward_signature(self): + pass + + # Data2VecAudio cannot resize token embeddings + # since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # Data2VecAudio has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_flax_to_pt(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_pt_to_flax(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + "project_hid.weight", + "project_hid.bias", + "project_q.weight", + "project_q.bias", + "feature_projection.projection.weight", + "feature_projection.projection.bias", + "objective.weight", + ] + if param.requires_grad: + if any([x in name for x in uniform_init_parms]): + self.assertTrue( + -1.0 <= ((param.data.mean() 
* 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + def test_mask_feature_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + def test_mask_time_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") + self.assertIsNotNone(model) + + +@require_torch +class Data2VecAudioRobustModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + Data2VecAudioForCTC, + Data2VecAudioModel, + Data2VecAudioForSequenceClassification, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForXVector, + ) + if is_torch_available() + else () + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Data2VecAudioModelTester( + self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True + ) + self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) + + def test_config(self): + 
self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_adapter(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) + + def test_model_with_adapter_proj_dim(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) + + def test_batched_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_batch_inference(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_xvector_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_xvector_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + # Data2VecAudio has no inputs_embeds + def test_inputs_embeds(self): + pass + + # `input_ids` is renamed to `input_values` + def test_forward_signature(self): + pass + + # Data2VecAudio cannot resize token embeddings + # since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # Data2VecAudio has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def 
test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + "project_hid.weight", + "project_hid.bias", + "project_q.weight", + "project_q.bias", + "feature_projection.projection.weight", + "feature_projection.projection.bias", + "objective.weight", + ] + if param.requires_grad: + if any([x in name for x in uniform_init_parms]): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + def test_mask_feature_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + def test_mask_time_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + def test_mask_time_feature_prob_ctc_single_batch(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", + mask_time_prob=0.2, + mask_feature_prob=0.2, + mask_time_length=2, + mask_feature_length=2, + ) + 
model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (1, 1498, 32)) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") + self.assertIsNotNone(model) + + +@require_torch +class Data2VecAudioUtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) + + def test_compute_mask_indices_low_prob(self): + # with these settings num_masked_spans=0.5, which means probabilistic rounding + # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in + # the other 5 out of 10, cases num_masked_spans=1 + n_trials = 100 + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + count_dimensions_masked = 0 + count_dimensions_not_masked = 0 + + for _ in range(n_trials): + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + num_masks = torch.sum(mask).item() + + if num_masks > 0: + count_dimensions_masked += 1 + else: + count_dimensions_not_masked += 1 + + # as we test for at least 10 masked dimension and at least + # 10 non-masked dimension, this test could fail with probability: + # P(100 coin flips, at most 9 heads) = 1.66e-18 + self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) + self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + def test_compute_mask_indices_attn_mask_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + attention_mask[:2, sequence_length // 2 :] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask + ) + mask = torch.from_numpy(mask).to(torch_device) + + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) + + def test_compute_mask_indices_short_audio(self): + batch_size = 4 + 
sequence_length = 100
+        mask_prob = 0.05
+        mask_length = 10
+
+        attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device)
+        # force one example to be heavily padded
+        attention_mask[0, 5:] = 0
+
+        mask = _compute_mask_indices(
+            (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2
+        )
+
+        # make sure that non-padded examples cannot be padded
+        self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any())
+
+    def test_compute_perplexity(self):
+        probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100
+
+        ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs)
+        self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3)
+
+        # mask half of the input
+        mask = torch.ones((2,), device=torch_device, dtype=torch.bool)
+        mask[0] = 0
+
+        ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask)
+        self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3)
+
+    def test_sample_negatives(self):
+        batch_size = 2
+        sequence_length = 10
+        hidden_size = 4
+        num_negatives = 3
+
+        features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view(
+            sequence_length, hidden_size
+        )  # each value in the vector consists of the same value
+        features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous()
+
+        # sample negative indices
+        sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None)
+        sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device)
+        negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)]
+        negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3)
+        self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size))
+
+        # make sure no negatively sampled vector is actually a positive one
+        for negative in negatives:
+            self.assertTrue(((negative - features) == 0).sum() == 0.0)
+
+        # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim
+        self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1))
+
+    def test_sample_negatives_with_mask(self):
+        batch_size = 2
+        sequence_length = 10
+        hidden_size = 4
+        num_negatives = 3
+
+        # second half of last input tensor is padded
+        mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device)
+        mask[-1, sequence_length // 2 :] = 0
+
+        features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view(
+            sequence_length, hidden_size
+        )  # each value in the vector consists of the same value
+        features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous()
+
+        # replace masked feature vectors with -100 to test that those are not sampled
+        features = torch.where(mask[:, :, None].expand(features.shape).bool(), features, -100)
+
+        # sample negative indices
+        sampled_negative_indices = _sample_negative_indices(
+            (batch_size, sequence_length), num_negatives, mask.cpu().numpy()
+        )
+        sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device)
+        negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)]
+        negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3)
+
+        self.assertTrue((negatives >= 0).all().item())
+
+        self.assertTrue(negatives.shape == (num_negatives,
batch_size, sequence_length, hidden_size)) + + # make sure no negatively sampled vector is actually a positive one + for negative in negatives: + self.assertTrue(((negative - features) == 0).sum() == 0.0) + + # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim + self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) + + +@require_torch +@require_soundfile +@slow +class Data2VecAudioModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def _load_superb(self, task, num_samples): + ds = load_dataset("anton-l/superb_dummy", task, split="test") + + return ds[:num_samples] + + def test_inference_ctc_normal(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + input_speech = self._load_datasamples(1) + + input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_normal_batched(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(2) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_robust_batched(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping 
blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + "his instant panic was followed by a small sharp blow high on his chest", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_keyword_spotting(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") + input_data = self._load_superb("ks", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) + + expected_labels = [7, 6, 10, 9] + # s3prl logits for the same batch + expected_logits = torch.tensor([6.1186, 11.8961, 10.2931, 6.0898], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_inference_intent_classification(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("ic", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + + predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1) + predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1) + predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1) + + expected_labels_action = [0, 0, 2, 3] + expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device) + expected_labels_object = [3, 10, 3, 4] + expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device) + expected_labels_location = [0, 0, 0, 1] + expected_logits_location = torch.tensor([1.5335, 6.5096, 10.5704, 11.0569], device=torch_device) + + self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) + self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) + self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) + + self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) + self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) + self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) + + def test_inference_speaker_identification(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("si", 4) + + output_logits = [] + with torch.no_grad(): + for example in input_data["speech"]: + input = processor(example, return_tensors="pt", padding=True) + output = 
model(input.input_values.to(torch_device), attention_mask=None) + output_logits.append(output.logits[0]) + output_logits = torch.stack(output_logits) + predicted_logits, predicted_ids = torch.max(output_logits, dim=-1) + + expected_labels = [251, 1, 1, 3] + # s3prl logits for the same batch + expected_logits = torch.tensor([37.5627, 71.6362, 64.2419, 31.7778], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_inference_emotion_recognition(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("er", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) + + expected_labels = [1, 1, 2, 2] + # s3prl logits for the same batch + expected_logits = torch.tensor([2.1722, 3.0779, 8.0287, 6.6797], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_phoneme_recognition(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-ic") + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t", + "s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ oʊ n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ", + "ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː v ə n ð ə s ɔːɹ ɹ ɪ ŋ ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t", + "h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t", + ] + # should correspond to =>: + # [ + # "a man said to the universe sir i exist", + # "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + # "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + # "his instant panic was followed by a small sharp blow high on his chest", + # ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_diarization(self): + model = Data2VecAudioForAudioFrameClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + 
input_data = self._load_superb("sd", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + # labels is a one-hot array of shape (num_frames, num_speakers) + labels = (outputs.logits > 0).long() + + # s3prl logits for the same batch + expected_logits = torch.tensor( + [ + [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], + [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], + [[-0.8656, -0.4783], [-0.8899, -0.3289], [-0.9267, -0.5781], [-0.7817, -0.4619]], + [[-4.8625, -2.5316], [-5.2339, -2.2155], [-4.9835, -2.0344], [-4.4727, -1.8421]], + ], + device=torch_device, + ) + self.assertEqual(labels[0, :, 0].sum(), 555) + self.assertEqual(labels[0, :, 1].sum(), 299) + # TODO: update the tolerance after the CI moves to torch 1.10 + self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) + + def test_inference_speaker_verification(self): + model = Data2VecAudioForXVector.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("si", 4) + + inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) + labels = torch.tensor([5, 1, 1, 3], device=torch_device).T + + with torch.no_grad(): + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + outputs = model(input_values, attention_mask=attention_mask, labels=labels) + embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1).cpu() + + cosine_sim = torch.nn.CosineSimilarity(dim=-1) + # id10002 vs id10002 + self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) + # id10006 vs id10002 + self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).numpy(), 0.7579, 3) + # id10002 vs id10004 + self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).numpy(), 0.7594, 3) + + # TODO: update the tolerance after the CI moves to torch 1.10 + self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) diff --git a/utils/check_repo.py b/utils/check_repo.py index 3391c6bfbbd1c..6ea2b0a839791 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -88,7 +88,6 @@ "TFRobertaForMultipleChoice", # TODO: fix "TrOCRDecoderWrapper", # Building part of bigger (tested) model. "SeparableConv1D", # Building part of bigger (tested) model. - "Data2VecAudioModel", # TODO: Add tests ] # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't @@ -170,7 +169,6 @@ "TFWav2Vec2ForCTC", "TFHubertForCTC", "Data2VecAudioModel", # TODO: Fix - "Data2VecAudioForPreTraining", # TODO: Fix ] # This is to make sure the transformers module imported is the one in the repo. 
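The integration tests above drive Data2VecAudioForCTC through the existing Wav2Vec2 processing pipeline. The following is a minimal inference sketch condensing that flow; it is only an illustration under the assumptions made by the test suite: the model checkpoint id is the placeholder used in the tests ("patrickvonplaten/data2vec-base-960h") and may not match the finally published checkpoints, and loading the processor from that same repository is an additional assumption for the sketch.

# Minimal CTC inference sketch mirroring the integration tests above.
# Checkpoint ids are the placeholders used in the test suite and may differ
# from the finally released data2vec audio models; loading the processor
# from the same repository is assumed here for illustration only.
import torch
from datasets import load_dataset
from transformers import Data2VecAudioForCTC, Wav2Vec2Processor

model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h")
processor = Wav2Vec2Processor.from_pretrained("patrickvonplaten/data2vec-base-960h")
model.eval()

# one sample from the dummy LibriSpeech split used by the tests above
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
speech = ds[0]["audio"]["array"]

inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids))

When several clips of different lengths are padded into one batch, the attention mask matters, which is why the batched integration tests above pass it explicitly.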
From b1365f77da22698c0d66ce96287483e5f321f841 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 14:28:50 +0000 Subject: [PATCH 095/121] correct autio models --- .../models/auto/configuration_auto.py | 8 +- src/transformers/models/auto/modeling_auto.py | 30 +- .../data2vec/configuration_data2vec_audio.py | 8 +- .../data2vec/configuration_data2vec_text.py | 2 +- .../models/data2vec/data2vec_audio.py | 4 +- .../data2vec/modeling_data2vec_audio.py | 5 +- .../models/data2vec/modeling_data2vec_text.py | 2 +- tests/test_modeling_common.py | 1 + tests/test_modeling_data2vec.py | 1520 +++++++++++++++++ tests/test_modeling_data2vec_audio.py | 264 +-- tests/test_modeling_data2vec_text.py | 1 - 11 files changed, 1552 insertions(+), 293 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index a63fc53f497ef..fa96166f5b079 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -122,6 +122,8 @@ ("unispeech-sat", "UniSpeechSatConfig"), ("unispeech", "UniSpeechConfig"), ("wavlm", "WavLMConfig"), + ("data2vec-audio", "Data2VecAudioConfig"), + ("data2vec-text", "Data2VecTextConfig"), ] ) @@ -324,10 +326,14 @@ ("xlsr_wav2vec2", "XLSR-Wav2Vec2"), ("mluke", "mLUKE"), ("layoutxlm", "LayoutXLM"), + ("data2vec-audio", "Data2VecAudio"), + ("data2vec-text", "Data2VecText"), ] ) -SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([("openai-gpt", "openai")]) +SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict( + [("openai-gpt", "openai"), ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec")] +) def model_type_to_module_name(key): diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4ebbe0f9bd4f4..25b49b309799a 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -83,7 +83,8 @@ ("bart", "BartModel"), ("longformer", "LongformerModel"), ("roberta", "RobertaModel"), - ("data2vec", "Data2VecTextModel"), + ("data2vec-text", "Data2VecTextModel"), + ("data2vec-audio", "Data2VecAudioModel"), ("layoutlm", "LayoutLMModel"), ("squeezebert", "SqueezeBertModel"), ("bert", "BertModel"), @@ -134,7 +135,7 @@ ("fsmt", "FSMTForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecTextForMaskedLM"), + ("data2vec-text", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForPreTraining"), ("big_bird", "BigBirdForPreTraining"), @@ -193,7 +194,7 @@ ("bart", "BartForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecTextForMaskedLM"), + ("data2vec-text", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForMaskedLM"), ("openai-gpt", "OpenAIGPTLMHeadModel"), @@ -234,7 +235,6 @@ ("xlm-roberta-xl", "XLMRobertaXLForCausalLM"), ("xlm-roberta", "XLMRobertaForCausalLM"), ("roberta", "RobertaForCausalLM"), - ("data2vec", "Data2VecForCausalLM"), ("bert", "BertLMHeadModel"), ("openai-gpt", "OpenAIGPTLMHeadModel"), ("gpt2", "GPT2LMHeadModel"), @@ -255,7 +255,7 @@ ("blenderbot-small", "BlenderbotSmallForCausalLM"), ("megatron-bert", "MegatronBertForCausalLM"), ("speech_to_text_2", "Speech2Text2ForCausalLM"), - ("data2vec", "Data2VecTextForCausalLM"), + ("data2vec-text", "Data2VecTextForCausalLM"), ] ) @@ -342,7 +342,7 @@ ("xlm-roberta", 
"XLMRobertaForMaskedLM"), ("longformer", "LongformerForMaskedLM"), ("roberta", "RobertaForMaskedLM"), - ("data2vec", "Data2VecTextForMaskedLM"), + ("data2vec-text", "Data2VecTextForMaskedLM"), ("squeezebert", "SqueezeBertForMaskedLM"), ("bert", "BertForMaskedLM"), ("megatron-bert", "MegatronBertForMaskedLM"), @@ -423,7 +423,7 @@ ("bart", "BartForSequenceClassification"), ("longformer", "LongformerForSequenceClassification"), ("roberta", "RobertaForSequenceClassification"), - ("data2vec", "Data2VecTextForSequenceClassification"), + ("data2vec-text", "Data2VecTextForSequenceClassification"), ("squeezebert", "SqueezeBertForSequenceClassification"), ("layoutlm", "LayoutLMForSequenceClassification"), ("bert", "BertForSequenceClassification"), @@ -473,7 +473,6 @@ ("xlm-roberta-xl", "XLMRobertaXLForQuestionAnswering"), ("xlm-roberta", "XLMRobertaForQuestionAnswering"), ("roberta", "RobertaForQuestionAnswering"), - ("data2vec", "Data2VecForQuestionAnswering"), ("squeezebert", "SqueezeBertForQuestionAnswering"), ("bert", "BertForQuestionAnswering"), ("xlnet", "XLNetForQuestionAnsweringSimple"), @@ -490,7 +489,7 @@ ("deberta-v2", "DebertaV2ForQuestionAnswering"), ("ibert", "IBertForQuestionAnswering"), ("splinter", "SplinterForQuestionAnswering"), - ("data2vec", "Data2VecTextForQuestionAnswering"), + ("data2vec-text", "Data2VecTextForQuestionAnswering"), ] ) @@ -523,7 +522,6 @@ ("xlm-roberta", "XLMRobertaForTokenClassification"), ("longformer", "LongformerForTokenClassification"), ("roberta", "RobertaForTokenClassification"), - ("data2vec", "Data2VecForTokenClassification"), ("squeezebert", "SqueezeBertForTokenClassification"), ("bert", "BertForTokenClassification"), ("megatron-bert", "MegatronBertForTokenClassification"), @@ -537,7 +535,7 @@ ("deberta-v2", "DebertaV2ForTokenClassification"), ("gpt2", "GPT2ForTokenClassification"), ("ibert", "IBertForTokenClassification"), - ("data2vec", "Data2VecTextForTokenClassification"), + ("data2vec-text", "Data2VecTextForTokenClassification"), ] ) @@ -559,7 +557,7 @@ ("xlm-roberta", "XLMRobertaForMultipleChoice"), ("longformer", "LongformerForMultipleChoice"), ("roberta", "RobertaForMultipleChoice"), - ("data2vec", "Data2VecTextForMultipleChoice"), + ("data2vec-text", "Data2VecTextForMultipleChoice"), ("squeezebert", "SqueezeBertForMultipleChoice"), ("bert", "BertForMultipleChoice"), ("distilbert", "DistilBertForMultipleChoice"), @@ -595,7 +593,7 @@ ("sew", "SEWForSequenceClassification"), ("sew-d", "SEWDForSequenceClassification"), ("wavlm", "WavLMForSequenceClassification"), - ("data2vec", "Data2VecAudioForSequenceClassification"), + ("data2vec-audio", "Data2VecAudioForSequenceClassification"), ] ) @@ -609,7 +607,7 @@ ("sew", "SEWForCTC"), ("sew-d", "SEWDForCTC"), ("wavlm", "WavLMForCTC"), - ("data2vec", "Data2VecAudioForCTC"), + ("data2vec-audio", "Data2VecAudioForCTC"), ] ) @@ -619,7 +617,7 @@ ("wav2vec2", "Wav2Vec2ForAudioFrameClassification"), ("unispeech-sat", "UniSpeechSatForAudioFrameClassification"), ("wavlm", "WavLMForAudioFrameClassification"), - ("data2vec", "Data2VecAudioForAudioFrameClassification"), + ("data2vec-audio", "Data2VecAudioForAudioFrameClassification"), ] ) @@ -629,7 +627,7 @@ ("wav2vec2", "Wav2Vec2ForXVector"), ("unispeech-sat", "UniSpeechSatForXVector"), ("wavlm", "WavLMForXVector"), - ("data2vec", "Data2VecAudioForXVector"), + ("data2vec-audio", "Data2VecAudioForXVector"), ] ) diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index 
c6e5780e86294..61c302fd7c21d 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -14,6 +14,8 @@ # limitations under the License. """ Data2VecText configuration""" +import math + from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -64,10 +66,6 @@ class Data2VecAudioConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group - normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D - convolutional layers. feat_proj_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for output of the feature encoder. feat_extract_activation (`str, `optional`, defaults to `"gelu"`): @@ -202,7 +200,6 @@ def __init__( layerdrop=0.1, initializer_range=0.02, layer_norm_eps=1e-5, - feat_extract_norm="group", feat_extract_activation="gelu", conv_dim=(512, 512, 512, 512, 512, 512, 512), conv_stride=(5, 2, 2, 2, 2, 2, 2), @@ -244,7 +241,6 @@ def __init__( ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm self.feat_extract_activation = feat_extract_activation self.conv_dim = list(conv_dim) self.conv_stride = list(conv_stride) diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py index 2389afd215840..3c7d6765dd314 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -93,7 +93,7 @@ class Data2VecTextConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "data2vec_text" + model_type = "data2vec-text" def __init__( self, diff --git a/src/transformers/models/data2vec/data2vec_audio.py b/src/transformers/models/data2vec/data2vec_audio.py index a5d91b82186f1..1c290d209d31e 100644 --- a/src/transformers/models/data2vec/data2vec_audio.py +++ b/src/transformers/models/data2vec/data2vec_audio.py @@ -452,13 +452,13 @@ def compute_var(y): if dist.is_initialized(): zc = torch.tensor(y.size(0)).cuda() zs = y.sum(dim=0) - zss = (y ** 2).sum(dim=0) + zss = (y**2).sum(dim=0) dist.all_reduce(zc) dist.all_reduce(zs) dist.all_reduce(zss) - var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1)) + var = zss / (zc - 1) - (zs**2) / (zc * (zc - 1)) return torch.sqrt(var + 1e-6).mean() else: return torch.sqrt(y.var(dim=0) + 1e-6).mean() diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index aad099fa7dce9..78e071a0064ef 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -33,7 +33,8 @@ add_start_docstrings_to_model_forward, ) from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput -from ...modeling_utils import PreTrainedModel, torch_int_div +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import 
torch_int_div from ...utils import logging from .configuration_data2vec_audio import Data2VecAudioConfig @@ -455,7 +456,7 @@ def __init__( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 4db194150bdb6..720e41481ffd4 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -66,7 +66,7 @@ _CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" _CTC_EXPECTED_LOSS = 53.48 -DATA2VECTEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [ +DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "data2vec", # See all data2vec models at https://huggingface.co/models?filter=data2vec ] diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 17888bcfac380..21ee53a824b0d 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1911,6 +1911,7 @@ def test_load_with_mismatched_shapes(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: + print(model_class) if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): continue diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 03d97f222fd8d..15ceb29ff9e0f 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -12,3 +12,1523 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" Testing suite for the PyTorch Data2VecAudio model. 
""" + +import math +import unittest + +import numpy as np +from datasets import load_dataset + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import Data2VecAudioConfig, Data2VecTextConfig, is_torch_available +from transformers.testing_utils import ( + TestCasePlus, + is_pt_flax_cross_test, + require_soundfile, + require_torch, + slow, + torch_device, +) + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, _config_zero_init + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextModel, + Wav2Vec2FeatureExtractor, + Wav2Vec2Processor, + ) + from transformers.models.data2vec.modeling_data2vec_text import ( + DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecTextForTextEmbeddings, + create_position_ids_from_input_ids, + ) + from transformers.models.wav2vec2.modeling_wav2vec2 import ( + Wav2Vec2GumbelVectorQuantizer, + _compute_mask_indices, + _sample_negative_indices, + ) + + +class Data2VecTextModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return Data2VecTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + def 
prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = Data2VecTextModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = Data2VecTextForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = Data2VecTextForCausalLM(config=config).to(torch_device).eval() + + # make sure that ids don't start with pad token + mask = input_ids.ne(config.pad_token_id).long() + input_ids = input_ids * mask + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + + # make sure that ids 
don't start with pad token + mask = next_tokens.ne(config.pad_token_id).long() + next_tokens = next_tokens * mask + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = Data2VecTextForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = Data2VecTextForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + 
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class Data2VecTextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextModel, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = Data2VecTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = Data2VecTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_create_position_ids_respects_padding_index(self): + """Ensure that the default position ids only assign a sequential . This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = Data2VecTextForTextEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) + + position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """Ensure that the default position ids only assign a sequential . This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = Data2VecTextForTextEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + +@require_torch +class Data2VecTextModelIntegrationTest(TestCasePlus): + @slow + def test_inference_masked_lm(self): + model = Data2VecTextForMaskedLM.from_pretrained("data2vec") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] + ) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') + # data2vec.eval() + # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_no_head(self): + model = Data2VecTextModel.from_pretrained("data2vec") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + # compare the actual values for a slice. 
+ expected_slice = torch.tensor( + [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] + ) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') + # data2vec.eval() + # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_classification_head(self): + model = Data2VecTextForSequenceClassification.from_pretrained("data2vec-large-mnli") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 3)) + self.assertEqual(output.shape, expected_shape) + expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) + + # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli') + # data2vec.eval() + # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() + + self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) + + +class Data2VecAudioModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, # speech is longer + is_training=False, + hidden_size=16, + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=4, + num_attention_heads=2, + hidden_dropout_prob=0.1, + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + mask_time_prob=0.5, + mask_time_length=2, + vocab_size=32, + num_adapter_layers=1, + adapter_stride=2, + tdnn_dim=(32, 32), + tdnn_kernel=(5, 3), + tdnn_dilation=(1, 2), + xvector_output_dim=32, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.num_adapter_layers = num_adapter_layers + self.adapter_stride = adapter_stride + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.scope = scope + self.tdnn_dim = tdnn_dim + self.tdnn_kernel = tdnn_kernel + self.tdnn_dilation = tdnn_dilation + self.xvector_output_dim = xvector_output_dim + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = 
random_attention_mask([self.batch_size, self.seq_length]) + + config = self.get_config() + + return config, input_values, attention_mask + + def get_config(self): + return Data2VecAudioConfig( + hidden_size=self.hidden_size, + feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + mask_time_prob=self.mask_time_prob, + mask_time_length=self.mask_time_length, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + layer_norm_eps=self.layer_norm_eps, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + vocab_size=self.vocab_size, + num_adapter_layers=self.num_adapter_layers, + adapter_stride=self.adapter_stride, + tdnn_dim=self.tdnn_dim, + tdnn_kernel=self.tdnn_kernel, + tdnn_dilation=self.tdnn_dilation, + xvector_output_dim=self.xvector_output_dim, + ) + + def create_and_check_model(self, config, input_values, attention_mask): + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter(self, config, input_values, attention_mask): + config.add_adapter = True + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): + config.add_adapter = True + config.output_hidden_size = 8 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), + ) + + def create_and_check_batch_inference(self, config, input_values, *args): + # test does not pass for models making use of `group_norm` + # check: https://github.com/pytorch/fairseq/issues/3227 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0.0 + + batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state + + for i in range(input_values.shape[0]): + input_slice = input_values[i : i + 1, : input_lengths[i]] + output = model(input_slice).last_hidden_state + + batch_output = batch_outputs[i : i + 1, : output.shape[1]] + self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) + + def check_ctc_loss(self, config, input_values, *args): + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + + # make sure that dropout is 
disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + model.config.ctc_loss_reduction = "sum" + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + model.config.ctc_loss_reduction = "mean" + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(sum_loss, float)) + self.parent.assertTrue(isinstance(mean_loss, float)) + + def check_seq_classifier_loss(self, config, input_values, *args): + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + unmasked_loss = model(input_values, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(masked_loss, float)) + self.parent.assertTrue(isinstance(unmasked_loss, float)) + self.parent.assertTrue(masked_loss != unmasked_loss) + + def check_ctc_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + model.train() + + # freeze feature encoder + model.freeze_feature_encoder() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + if max_length_labels[i] < labels.shape[-1]: + # it's important that we make sure that target lengths are at least + # one shorter than logit lengths to prevent -inf + labels[i, max_length_labels[i] - 1 :] = -100 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_seq_classifier_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + +
loss.backward() + + def check_xvector_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForXVector(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_labels_out_of_vocab(self, config, input_values, *args): + model = Data2VecAudioForCTC(config) + model.to(torch_device) + model.train() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) + + with self.parent.assertRaises(ValueError): + model(input_values, labels=labels) + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (Data2VecAudioForCTC, Data2VecAudioModel, Data2VecAudioForSequenceClassification) + if is_torch_available() + else () + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Data2VecAudioModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_adapter(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) + + def test_model_with_adapter_proj_dim(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_xvector_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_xvector_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + # Data2VecAudio has no inputs_embeds + def test_inputs_embeds(self): + pass + + # `input_ids` is renamed to `input_values` + def test_forward_signature(self): + pass + + # Data2VecAudio cannot resize token embeddings + # since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # Data2VecAudio has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_flax_to_pt(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_pt_to_flax(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + "project_hid.weight", + "project_hid.bias", + "project_q.weight", + "project_q.bias", + "feature_projection.projection.weight", + "feature_projection.projection.bias", + "objective.weight", + ] + if param.requires_grad: + if any([x in name for x in uniform_init_parms]): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + 
module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + def test_mask_feature_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + def test_mask_time_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") + self.assertIsNotNone(model) + + +@require_torch +class Data2VecAudioUtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) + + def test_compute_mask_indices_low_prob(self): + # with these settings num_masked_spans=0.5, which means probabilistic rounding + # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in + # the other 5 out of 10, cases num_masked_spans=1 + n_trials = 100 + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + count_dimensions_masked = 0 + count_dimensions_not_masked = 0 + + for _ in range(n_trials): + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + num_masks = torch.sum(mask).item() + + if num_masks > 0: + count_dimensions_masked += 1 + else: + count_dimensions_not_masked += 1 + + # as we test for at least 10 masked dimension and at least + # 10 non-masked dimension, this test could fail with probability: + # P(100 coin flips, at most 9 heads) = 1.66e-18 + self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) + 
self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + # because of overlap, the mask sums don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + def test_compute_mask_indices_attn_mask_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + attention_mask[:2, sequence_length // 2 :] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask + ) + mask = torch.from_numpy(mask).to(torch_device) + + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) + + def test_compute_mask_indices_short_audio(self): + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + # force one example to be heavily padded + attention_mask[0, 5:] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 + ) + + # make sure that non-padded examples cannot be padded + self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) + + def test_compute_perplexity(self): + probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100 + + ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) + self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) + + # mask half of the input + mask = torch.ones((2,), device=torch_device, dtype=torch.bool) + mask[0] = 0 + + ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) + self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3) + + def test_sample_negatives(self): + batch_size = 2 + sequence_length = 10 + hidden_size = 4 + num_negatives = 3 + + features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( + sequence_length, hidden_size + ) # each vector consists of the same value + features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() + + # sample negative indices + sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) + sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) + negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] + negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) + self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) + + # make sure no negatively sampled vector is actually a positive one + for negative in negatives: + self.assertTrue(((negative - features) == 0).sum() == 0.0) + + # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim + self.assertEqual(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) + + def
test_sample_negatives_with_mask(self): + batch_size = 2 + sequence_length = 10 + hidden_size = 4 + num_negatives = 3 + + # second half of last input tensor is padded + mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + mask[-1, sequence_length // 2 :] = 0 + + features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( + sequence_length, hidden_size + ) # each vector consists of the same value + features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() + + # replace masked feature vectors with -100 to test that those are not sampled + features = torch.where(mask[:, :, None].expand(features.shape).bool(), features, -100) + + # sample negative indices + sampled_negative_indices = _sample_negative_indices( + (batch_size, sequence_length), num_negatives, mask.cpu().numpy() + ) + sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) + negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] + negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) + + self.assertTrue((negatives >= 0).all().item()) + + self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) + + # make sure no negatively sampled vector is actually a positive one + for negative in negatives: + self.assertTrue(((negative - features) == 0).sum() == 0.0) + + # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim + self.assertEqual(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) + + +@require_torch +@require_soundfile +@slow +class Data2VecAudioModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def _load_superb(self, task, num_samples): + ds = load_dataset("anton-l/superb_dummy", task, split="test") + + return ds[:num_samples] + + def test_inference_ctc_normal(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + input_speech = self._load_datasamples(1) + + input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_normal_batched(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(2) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + + with torch.no_grad(): + logits =
model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_robust_batched(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + "his instant panic was followed by a small sharp blow high on his chest", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_keyword_spotting(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") + input_data = self._load_superb("ks", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) + + expected_labels = [7, 6, 10, 9] + # s3prl logits for the same batch + expected_logits = torch.tensor([6.1186, 11.8961, 10.2931, 6.0898], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_inference_intent_classification(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("ic", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + + predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1) + predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1) + predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1) + + expected_labels_action = [0, 0, 2, 3] + expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device) + 
expected_labels_object = [3, 10, 3, 4] + expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device) + expected_labels_location = [0, 0, 0, 1] + expected_logits_location = torch.tensor([1.5335, 6.5096, 10.5704, 11.0569], device=torch_device) + + self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) + self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) + self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) + + self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) + self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) + self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) + + def test_inference_speaker_identification(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("si", 4) + + output_logits = [] + with torch.no_grad(): + for example in input_data["speech"]: + input = processor(example, return_tensors="pt", padding=True) + output = model(input.input_values.to(torch_device), attention_mask=None) + output_logits.append(output.logits[0]) + output_logits = torch.stack(output_logits) + predicted_logits, predicted_ids = torch.max(output_logits, dim=-1) + + expected_labels = [251, 1, 1, 3] + # s3prl logits for the same batch + expected_logits = torch.tensor([37.5627, 71.6362, 64.2419, 31.7778], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_inference_emotion_recognition(self): + model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("er", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) + + expected_labels = [1, 1, 2, 2] + # s3prl logits for the same batch + expected_logits = torch.tensor([2.1722, 3.0779, 8.0287, 6.6797], device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) + + def test_phoneme_recognition(self): + model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-ic") + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t", + "s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b 
ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ oʊ n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ", + "ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː v ə n ð ə s ɔːɹ ɹ ɪ ŋ ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t", + "h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t", + ] + # should correspond to =>: + # [ + # "a man said to the universe sir i exist", + # "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + # "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + # "his instant panic was followed by a small sharp blow high on his chest", + # ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_diarization(self): + model = Data2VecAudioForAudioFrameClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( + torch_device + ) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("sd", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + # labels is a one-hot array of shape (num_frames, num_speakers) + labels = (outputs.logits > 0).long() + + # s3prl logits for the same batch + expected_logits = torch.tensor( + [ + [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], + [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], + [[-0.8656, -0.4783], [-0.8899, -0.3289], [-0.9267, -0.5781], [-0.7817, -0.4619]], + [[-4.8625, -2.5316], [-5.2339, -2.2155], [-4.9835, -2.0344], [-4.4727, -1.8421]], + ], + device=torch_device, + ) + self.assertEqual(labels[0, :, 0].sum(), 555) + self.assertEqual(labels[0, :, 1].sum(), 299) + # TODO: update the tolerance after the CI moves to torch 1.10 + self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) + + def test_inference_speaker_verification(self): + model = Data2VecAudioForXVector.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") + input_data = self._load_superb("si", 4) + + inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) + labels = torch.tensor([5, 1, 1, 3], device=torch_device).T + + with torch.no_grad(): + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + outputs = model(input_values, attention_mask=attention_mask, labels=labels) + embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1).cpu() + + cosine_sim = torch.nn.CosineSimilarity(dim=-1) + # id10002 vs id10002 + self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) + # id10006 vs id10002 + self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).numpy(), 0.7579, 3) + # id10002 vs id10004 + self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).numpy(), 0.7594, 3) + + # TODO: update 
the tolerance after the CI moves to torch 1.10 + self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) diff --git a/tests/test_modeling_data2vec_audio.py b/tests/test_modeling_data2vec_audio.py index 8a39f3d11e789..cf2eade935012 100644 --- a/tests/test_modeling_data2vec_audio.py +++ b/tests/test_modeling_data2vec_audio.py @@ -55,7 +55,6 @@ def __init__( seq_length=1024, # speech is longer is_training=False, hidden_size=16, - feat_extract_norm="group", feat_extract_dropout=0.0, feat_extract_activation="gelu", conv_dim=(32, 32, 32), @@ -66,7 +65,7 @@ def __init__( num_conv_pos_embedding_groups=2, num_hidden_layers=4, num_attention_heads=2, - hidden_dropout_prob=0.1, # this is most likely not correctly set yet + hidden_dropout_prob=0.1, intermediate_size=20, layer_norm_eps=1e-5, hidden_act="gelu", @@ -74,7 +73,6 @@ def __init__( mask_time_prob=0.5, mask_time_length=2, vocab_size=32, - do_stable_layer_norm=False, num_adapter_layers=1, adapter_stride=2, tdnn_dim=(32, 32), @@ -88,7 +86,6 @@ def __init__( self.seq_length = seq_length self.is_training = is_training self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm self.feat_extract_dropout = feat_extract_dropout self.feat_extract_activation = feat_extract_activation self.conv_dim = conv_dim @@ -105,7 +102,6 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm self.num_adapter_layers = num_adapter_layers self.adapter_stride = adapter_stride self.mask_time_prob = mask_time_prob @@ -135,7 +131,6 @@ def prepare_config_and_inputs(self): def get_config(self): return Data2VecAudioConfig( hidden_size=self.hidden_size, - feat_extract_norm=self.feat_extract_norm, feat_extract_dropout=self.feat_extract_dropout, feat_extract_activation=self.feat_extract_activation, conv_dim=self.conv_dim, @@ -151,7 +146,6 @@ def get_config(self): hidden_dropout_prob=self.hidden_dropout_prob, intermediate_size=self.intermediate_size, layer_norm_eps=self.layer_norm_eps, - do_stable_layer_norm=self.do_stable_layer_norm, hidden_act=self.hidden_act, initializer_range=self.initializer_range, vocab_size=self.vocab_size, @@ -595,262 +589,6 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) -@require_torch -class Data2VecAudioRobustModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Data2VecAudioForCTC, - Data2VecAudioModel, - Data2VecAudioForSequenceClassification, - Data2VecAudioForAudioFrameClassification, - Data2VecAudioForXVector, - ) - if is_torch_available() - else () - ) - test_pruning = False - test_headmasking = False - test_torchscript = False - - def setUp(self): - self.model_tester = Data2VecAudioModelTester( - self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True - ) - self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - 
def test_batched_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_batch_inference(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # Data2VecAudio has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to `input_values` - def test_forward_signature(self): - pass - - # Data2VecAudio cannot resize token embeddings - # since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # Data2VecAudio has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented - def test_model_common_attributes(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - model.to(torch_device) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - - input_lengths = torch.tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - 
"objective.weight", - ] - if param.requires_grad: - if any([x in name for x in uniform_init_parms]): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def test_mask_feature_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - def test_mask_time_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - def test_mask_time_feature_prob_ctc_single_batch(self): - model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", - mask_time_prob=0.2, - mask_feature_prob=0.2, - mask_time_length=2, - mask_feature_length=2, - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - 
self.assertEqual(logits.shape, (1, 1498, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") - self.assertIsNotNone(model) - - @require_torch class Data2VecAudioUtilsTest(unittest.TestCase): def test_compute_mask_indices(self): diff --git a/tests/test_modeling_data2vec_text.py b/tests/test_modeling_data2vec_text.py index ffdc88fbd8f74..203ad1609e6b0 100644 --- a/tests/test_modeling_data2vec_text.py +++ b/tests/test_modeling_data2vec_text.py @@ -339,7 +339,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( Data2VecTextForCausalLM, From 796ab6e5a3ce0051fafb8b393389944dd34a0d2d Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 14:53:20 +0000 Subject: [PATCH 096/121] correct config auto --- src/transformers/models/auto/configuration_auto.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index fa96166f5b079..5c42cad65f2d2 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -91,7 +91,6 @@ ("reformer", "ReformerConfig"), ("longformer", "LongformerConfig"), ("roberta", "RobertaConfig"), - ("data2vec", "Data2VecTextConfig"), ("deberta-v2", "DebertaV2Config"), ("deberta", "DebertaConfig"), ("flaubert", "FlaubertConfig"), @@ -180,7 +179,8 @@ ("xlnet", "XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("data2vec", "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("data2vec-text", "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("data2vec-audio", "DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -274,7 +274,6 @@ ("reformer", "Reformer"), ("longformer", "Longformer"), ("roberta", "RoBERTa"), - ("data2vec", "Data2Vec"), ("flaubert", "FlauBERT"), ("fsmt", "FairSeq Machine-Translation"), ("squeezebert", "SqueezeBERT"), From 71be48376bcfa9600ffc4494e5cbd3dbf9e74d37 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 14:54:20 +0000 Subject: [PATCH 097/121] correct tok auto --- src/transformers/models/auto/tokenization_auto.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index cdf84c0aad532..0c953f1636bf5 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -108,7 +108,6 @@ ("bart", ("BartTokenizer", "BartTokenizerFast")), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), - ("data2vec", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ( "reformer", ( From 65e80dd8253b9f0b10800c99cbef3613b8f1521f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 14:57:32 +0000 Subject: [PATCH 098/121] delete unnecessary 
files --- .../models/data2vec/configuration_data2vec.py | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 src/transformers/models/data2vec/configuration_data2vec.py diff --git a/src/transformers/models/data2vec/configuration_data2vec.py b/src/transformers/models/data2vec/configuration_data2vec.py deleted file mode 100644 index b0b4464ad1170..0000000000000 --- a/src/transformers/models/data2vec/configuration_data2vec.py +++ /dev/null @@ -1,14 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. From 553fb11f9f7580743bd41c15fedad8bac80df7d7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 14:57:39 +0000 Subject: [PATCH 099/121] delete unnecessary files --- tests/test_modeling_data2vec_audio.py | 1025 ------------------------- tests/test_modeling_data2vec_text.py | 533 ------------- 2 files changed, 1558 deletions(-) delete mode 100644 tests/test_modeling_data2vec_audio.py delete mode 100644 tests/test_modeling_data2vec_text.py diff --git a/tests/test_modeling_data2vec_audio.py b/tests/test_modeling_data2vec_audio.py deleted file mode 100644 index cf2eade935012..0000000000000 --- a/tests/test_modeling_data2vec_audio.py +++ /dev/null @@ -1,1025 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Data2VecAudio model. 
""" - -import math -import unittest - -import numpy as np -from datasets import load_dataset - -from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask -from transformers import Data2VecAudioConfig, is_torch_available -from transformers.testing_utils import is_pt_flax_cross_test, require_soundfile, require_torch, slow, torch_device - -from .test_configuration_common import ConfigTester -from .test_modeling_common import ModelTesterMixin, _config_zero_init - - -if is_torch_available(): - import torch - - from transformers import ( - Data2VecAudioForAudioFrameClassification, - Data2VecAudioForCTC, - Data2VecAudioForSequenceClassification, - Data2VecAudioForXVector, - Data2VecAudioModel, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - ) - from transformers.models.wav2vec2.modeling_wav2vec2 import ( - Wav2Vec2GumbelVectorQuantizer, - _compute_mask_indices, - _sample_negative_indices, - ) - - -class Data2VecAudioModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=4, - num_attention_heads=2, - hidden_dropout_prob=0.1, - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - mask_time_prob=0.5, - mask_time_length=2, - vocab_size=32, - num_adapter_layers=1, - adapter_stride=2, - tdnn_dim=(32, 32), - tdnn_kernel=(5, 3), - tdnn_dilation=(1, 2), - xvector_output_dim=32, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.num_adapter_layers = num_adapter_layers - self.adapter_stride = adapter_stride - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.scope = scope - self.tdnn_dim = tdnn_dim - self.tdnn_kernel = tdnn_kernel - self.tdnn_dilation = tdnn_dilation - self.xvector_output_dim = xvector_output_dim - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): 
- return Data2VecAudioConfig( - hidden_size=self.hidden_size, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - mask_time_prob=self.mask_time_prob, - mask_time_length=self.mask_time_length, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - num_adapter_layers=self.num_adapter_layers, - adapter_stride=self.adapter_stride, - tdnn_dim=self.tdnn_dim, - tdnn_kernel=self.tdnn_kernel, - tdnn_dilation=self.tdnn_dilation, - xvector_output_dim=self.xvector_output_dim, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter(self, config, input_values, attention_mask): - config.add_adapter = True - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): - config.add_adapter = True - config.output_hidden_size = 8 - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = Data2VecAudioForCTC(config=config) - model.to(torch_device) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) - - 
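# Illustrative sketch (helper name and defaults are assumptions, not something this
# patch defines): the model's `_get_feat_extract_output_lengths` used just below
# (and, approximately, the tester's `output_seq_length`) follow the standard Conv1d
# length rule, applied once per feature-extractor layer: L -> floor((L - kernel) / stride) + 1.
def _illustrative_conv_output_length(length, kernels=(8, 8, 8), strides=(4, 4, 4)):
    # Defaults mirror the tester's conv_kernel / conv_stride.
    for kernel, stride in zip(kernels, strides):
        length = (length - kernel) // stride + 1
    return length

# With the tester's default seq_length=1024, the transformer encoder sees 14 frames.
assert _illustrative_conv_output_length(1024) == 14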
input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = Data2VecAudioForSequenceClassification(config=config) - model.to(torch_device) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForCTC(config=config) - model.to(torch_device) - model.train() - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lenghts are at least - # one shorter than logit lenghts to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForSequenceClassification(config=config) - model.to(torch_device) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_xvector_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = 
Data2VecAudioForXVector(config=config) - model.to(torch_device) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = Data2VecAudioForCTC(config) - model.to(torch_device) - model.train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) - - with self.parent.assertRaises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_torch -class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (Data2VecAudioForCTC, Data2VecAudioModel, Data2VecAudioForSequenceClassification) - if is_torch_available() - else () - ) - test_pruning = False - test_headmasking = False - test_torchscript = False - - def setUp(self): - self.model_tester = Data2VecAudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # Data2VecAudio has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to 
`input_values` - def test_forward_signature(self): - pass - - # Data2VecAudio cannot resize token embeddings - # since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # Data2VecAudio has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented - def test_model_common_attributes(self): - pass - - @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_flax_to_pt(self): - pass - - @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_pt_to_flax(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - model.to(torch_device) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - - input_lengths = torch.tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any([x in name for x in uniform_init_parms]): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and 
module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def test_mask_feature_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - def test_mask_time_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") - self.assertIsNotNone(model) - - -@require_torch -class Data2VecAudioUtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) - - def test_compute_mask_indices_low_prob(self): - # with these settings num_masked_spans=0.5, which means probabilistic rounding - # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in - # the other 5 out of 10, cases num_masked_spans=1 - n_trials = 100 - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - count_dimensions_masked = 0 - count_dimensions_not_masked = 0 - - for _ in range(n_trials): - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - num_masks = torch.sum(mask).item() - - if num_masks > 0: - count_dimensions_masked += 1 - else: - count_dimensions_not_masked += 1 - - # as we test for at least 10 masked dimension and at least - # 10 non-masked dimension, this test could fail with probability: - # P(100 coin flips, at most 9 heads) = 1.66e-18 - self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) - self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = 
_compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - def test_compute_mask_indices_attn_mask_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) - attention_mask[:2, sequence_length // 2 :] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask - ) - mask = torch.from_numpy(mask).to(torch_device) - - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) - - def test_compute_mask_indices_short_audio(self): - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) - # force one example to be heavily padded - attention_mask[0, 5:] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 - ) - - # make sure that non-padded examples cannot be padded - self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) - - def test_compute_perplexity(self): - probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100 - - ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) - self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) - - # mask half of the input - mask = torch.ones((2,), device=torch_device, dtype=torch.bool) - mask[0] = 0 - - ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) - self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3) - - def test_sample_negatives(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() - - # sample negative indices - sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) - sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - - def test_sample_negatives_with_mask(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - # second half of last input tensor is padded - mask = torch.ones((batch_size, sequence_length), 
dtype=torch.long, device=torch_device) - mask[-1, sequence_length // 2 :] = 0 - - features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() - - # replace masked feature vectors with -100 to test that those are not sampled - features = torch.where(mask[:, :, None].expand(features.shape).bool(), features, -100) - - # sample negative indices - sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask.cpu().numpy() - ) - sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - - self.assertTrue((negatives >= 0).all().item()) - - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - - -@require_torch -@require_soundfile -@slow -class Data2VecAudioModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test") - - return ds[:num_samples] - - def test_inference_ctc_normal(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") - model.to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - input_speech = self._load_datasamples(1) - - input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) - - with torch.no_grad(): - logits = model(input_values).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_normal_batched(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") - model.to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - - with torch.no_grad(): - logits = model(input_values).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body 
trickling into the tight lowing cloth that was the only garment he wore", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_robust_batched(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - - with torch.no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", - "his instant panic was followed by a small sharp blow high on his chest", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_keyword_spotting(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") - input_data = self._load_superb("ks", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) - - expected_labels = [7, 6, 10, 9] - # s3prl logits for the same batch - expected_logits = torch.tensor([6.1186, 11.8961, 10.2931, 6.0898], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_inference_intent_classification(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("ic", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - - predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1) - predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1) - predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1) - - expected_labels_action = [0, 0, 2, 3] - expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device) - expected_labels_object = [3, 10, 3, 4] - expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device) - expected_labels_location = [0, 0, 0, 1] - expected_logits_location = torch.tensor([1.5335, 6.5096, 
10.5704, 11.0569], device=torch_device) - - self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) - self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) - self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) - - self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) - self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) - self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) - - def test_inference_speaker_identification(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("si", 4) - - output_logits = [] - with torch.no_grad(): - for example in input_data["speech"]: - input = processor(example, return_tensors="pt", padding=True) - output = model(input.input_values.to(torch_device), attention_mask=None) - output_logits.append(output.logits[0]) - output_logits = torch.stack(output_logits) - predicted_logits, predicted_ids = torch.max(output_logits, dim=-1) - - expected_labels = [251, 1, 1, 3] - # s3prl logits for the same batch - expected_logits = torch.tensor([37.5627, 71.6362, 64.2419, 31.7778], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_inference_emotion_recognition(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("er", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) - - expected_labels = [1, 1, 2, 2] - # s3prl logits for the same batch - expected_logits = torch.tensor([2.1722, 3.0779, 8.0287, 6.6797], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_phoneme_recognition(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-ic") - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - - with torch.no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t", - "s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ oʊ n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ", - "ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː v ə n ð ə s ɔːɹ ɹ ɪ ŋ 
ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t", - "h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t", - ] - # should correspond to =>: - # [ - # "a man said to the universe sir i exist", - # "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - # "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", - # "his instant panic was followed by a small sharp blow high on his chest", - # ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_diarization(self): - model = Data2VecAudioForAudioFrameClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("sd", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - # labels is a one-hot array of shape (num_frames, num_speakers) - labels = (outputs.logits > 0).long() - - # s3prl logits for the same batch - expected_logits = torch.tensor( - [ - [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], - [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], - [[-0.8656, -0.4783], [-0.8899, -0.3289], [-0.9267, -0.5781], [-0.7817, -0.4619]], - [[-4.8625, -2.5316], [-5.2339, -2.2155], [-4.9835, -2.0344], [-4.4727, -1.8421]], - ], - device=torch_device, - ) - self.assertEqual(labels[0, :, 0].sum(), 555) - self.assertEqual(labels[0, :, 1].sum(), 299) - # TODO: update the tolerance after the CI moves to torch 1.10 - self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) - - def test_inference_speaker_verification(self): - model = Data2VecAudioForXVector.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("si", 4) - - inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) - labels = torch.tensor([5, 1, 1, 3], device=torch_device).T - - with torch.no_grad(): - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - outputs = model(input_values, attention_mask=attention_mask, labels=labels) - embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1).cpu() - - cosine_sim = torch.nn.CosineSimilarity(dim=-1) - # id10002 vs id10002 - self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) - # id10006 vs id10002 - self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).numpy(), 0.7579, 3) - # id10002 vs id10004 - self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).numpy(), 0.7594, 3) - - # TODO: update the tolerance after the CI moves to torch 1.10 - self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) diff --git a/tests/test_modeling_data2vec_text.py b/tests/test_modeling_data2vec_text.py deleted file mode 100644 index 
203ad1609e6b0..0000000000000 --- a/tests/test_modeling_data2vec_text.py +++ /dev/null @@ -1,533 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Data2VecText model. """ - - -import unittest - -from transformers import Data2VecTextConfig, is_torch_available -from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device - -from .test_configuration_common import ConfigTester -from .test_generation_utils import GenerationTesterMixin -from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - - -if is_torch_available(): - import torch - - from transformers import ( - Data2VecTextForCausalLM, - Data2VecTextForMaskedLM, - Data2VecTextForMultipleChoice, - Data2VecTextForQuestionAnswering, - Data2VecTextForSequenceClassification, - Data2VecTextForTokenClassification, - Data2VecTextModel, - ) - from transformers.models.data2vec.modeling_data2vec import ( - DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecTextForTextEmbeddings, - create_position_ids_from_input_ids, - ) - - -class Data2VecModelTester: - def __init__( - self, - parent, - ): - self.parent = parent - self.batch_size = 13 - self.seq_length = 7 - self.is_training = True - self.use_input_mask = True - self.use_token_type_ids = True - self.use_labels = True - self.vocab_size = 99 - self.hidden_size = 32 - self.num_hidden_layers = 5 - self.num_attention_heads = 4 - self.intermediate_size = 37 - self.hidden_act = "gelu" - self.hidden_dropout_prob = 0.1 - self.attention_probs_dropout_prob = 0.1 - self.max_position_embeddings = 512 - self.type_vocab_size = 16 - self.type_sequence_label_size = 2 - self.initializer_range = 0.02 - self.num_labels = 3 - self.num_choices = 4 - self.scope = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return Data2VecTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Data2VecTextModel(config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Data2VecTextForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Data2VecTextForCausalLM(config=config).to(torch_device).eval() - - # make sure that ids don't start with pad token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - 
) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = Data2VecTextForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = Data2VecTextForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextForQuestionAnswering(config=config) - 
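# Illustrative sketch (sizes are arbitrary assumptions): the multiple-choice head
# checked above expects inputs of shape (batch_size, num_choices, seq_length); the
# tester builds them by repeating each example across the choice dimension.
import torch

batch_size, num_choices, seq_length, vocab_size = 2, 4, 7, 99
input_ids = torch.randint(0, vocab_size, (batch_size, seq_length))
multiple_choice_ids = input_ids.unsqueeze(1).expand(-1, num_choices, -1).contiguous()
assert multiple_choice_ids.shape == (batch_size, num_choices, seq_length)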
model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class Data2VecModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Data2VecTextForCausalLM, - Data2VecTextForMaskedLM, - Data2VecTextModel, - Data2VecTextForSequenceClassification, - Data2VecTextForTokenClassification, - Data2VecTextForMultipleChoice, - Data2VecTextForQuestionAnswering, - ) - if is_torch_available() - else () - ) - all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else () - - def setUp(self): - self.model_tester = Data2VecModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = Data2VecTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_create_position_ids_respects_padding_index(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = Data2VecTextForTextEmbeddings(config=config) - - input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = torch.as_tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = Data2VecTextForTextEmbeddings(config=config) - - inputs_embeds = torch.empty(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - -@require_torch -class Data2VecModelIntegrationTest(TestCasePlus): - @slow - def test_inference_masked_lm(self): - model = Data2VecTextForMaskedLM.from_pretrained("data2vec") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 11, 50265)) - self.assertEqual(output.shape, expected_shape) - # compare the actual values for a slice. 
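# Illustrative sketch (padding_idx=1 is an assumed value, matching the usual
# RoBERTa-style default): the two position-id tests above expect sequential ids
# offset by padding_idx + 1, with padded positions pinned to padding_idx.
import torch

def _illustrative_position_ids(input_ids, padding_idx=1):
    mask = input_ids.ne(padding_idx).int()
    return (torch.cumsum(mask, dim=1) * mask).long() + padding_idx

_illustrative_position_ids(torch.tensor([[12, 31, 13, 1]]))
# -> tensor([[2, 3, 4, 1]])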
- expected_slice = torch.tensor( - [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] - ) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_no_head(self): - model = Data2VecTextModel.from_pretrained("data2vec") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - # compare the actual values for a slice. - expected_slice = torch.tensor( - [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] - ) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach() - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_classification_head(self): - model = Data2VecTextForSequenceClassification.from_pretrained("data2vec-large-mnli") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 3)) - self.assertEqual(output.shape, expected_shape) - expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli') - # data2vec.eval() - # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() - - self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) From 6d8c95252cfca70204da7ef716d8d25898e6c929 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 15:55:57 +0100 Subject: [PATCH 100/121] Update utils/tests_fetcher.py --- utils/tests_fetcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 6c912141d36f0..78fb572e16e6f 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -373,7 +373,6 @@ def module_to_test_file(module_fname): "tests/sagemaker/test_single_node_gpu.py", # SageMaker test "tests/sagemaker/test_multi_node_model_parallel.py", # SageMaker test "tests/sagemaker/test_multi_node_data_parallel.py", # SageMaker test - "tests/test_modeling_data2vec.py", # TODO:Remove redundant file ] From 4f22fcbbcdc699b6765cb2c8a7b02d0992ccb3a6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 15:10:20 +0000 Subject: [PATCH 101/121] further renaming --- .../data2vec/configuration_data2vec_audio.py | 42 +----------- .../data2vec/configuration_data2vec_text.py | 6 +- .../data2vec/modeling_data2vec_audio.py | 68 ------------------- .../models/data2vec/modeling_data2vec_text.py | 26 +++---- 4 files changed, 19 insertions(+), 123 deletions(-) diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index 61c302fd7c21d..a3a222122777e 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -33,7 +33,7 @@ class Data2VecAudioConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate an Data2VecAudio model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecAudio - [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture. + [facebook/data2vec-audio-base-960h](https://huggingface.co/facebook/data2vec-audio-base-960h) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -71,8 +71,6 @@ class Data2VecAudioConfig(PretrainedConfig): feat_extract_activation (`str, `optional`, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for quantized feature encoder states. conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. @@ -114,22 +112,6 @@ class Data2VecAudioConfig(PretrainedConfig): The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time step, irrespectively of `mask_feature_prob`. Only relevant if ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' - num_codevectors_per_group (`int`, *optional*, defaults to 320): - Number of entries in each quantization codebook (group). - num_codevector_groups (`int`, *optional*, defaults to 2): - Number of codevector groups for product codevector quantization. - contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): - The temperature *kappa* in the contrastive loss. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. - num_negatives (`int`, *optional*, defaults to 100): - Number of negative samples for the contrastive loss. - codevector_dim (`int`, *optional*, defaults to 256): - Dimensionality of the quantized feature vectors. - proj_codevector_dim (`int`, *optional*, defaults to 256): - Dimensionality of the final projection of both the quantized and the transformer features. - diversity_loss_weight (`int`, *optional*, defaults to 0.1): - The weight of the codebook diversity loss component. ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an instance of [`Data2VecAudioForCTC`]. 
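# Illustrative sketch (all sizes below are arbitrary assumptions): `ctc_loss_reduction`
# is used as the `reduction` argument of PyTorch's CTC loss, so "sum" and "mean"
# only change how the per-sequence losses are aggregated.
import torch

time_steps, batch_size, vocab_size, target_len = 50, 2, 32, 10
log_probs = torch.randn(time_steps, batch_size, vocab_size).log_softmax(dim=-1)
targets = torch.randint(1, vocab_size, (batch_size, target_len))
input_lengths = torch.full((batch_size,), time_steps, dtype=torch.long)
target_lengths = torch.full((batch_size,), target_len, dtype=torch.long)

sum_loss = torch.nn.CTCLoss(blank=0, reduction="sum")(log_probs, targets, input_lengths, target_lengths)
mean_loss = torch.nn.CTCLoss(blank=0, reduction="mean")(log_probs, targets, input_lengths, target_lengths)
# "mean" divides each sequence loss by its target length and then averages over
# the batch, so it comes out smaller than "sum" here.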
@@ -172,10 +154,10 @@ class Data2VecAudioConfig(PretrainedConfig): ```python >>> from transformers import Data2VecAudioModel, Data2VecAudioConfig - >>> # Initializing a Data2VecAudio facebook/wav2vec2-base-960h style configuration + >>> # Initializing a Data2VecAudio facebook/data2vec-audio-base-960h style configuration >>> configuration = Data2VecAudioConfig() - >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration + >>> # Initializing a model from the facebook/data2vec-audio-base-960h style configuration >>> model = Data2VecAudioModel(configuration) >>> # Accessing the model configuration @@ -195,7 +177,6 @@ def __init__( activation_dropout=0.1, attention_dropout=0.1, feat_proj_dropout=0.0, - feat_quantizer_dropout=0.0, final_dropout=0.1, layerdrop=0.1, initializer_range=0.02, @@ -214,13 +195,6 @@ def __init__( mask_feature_prob=0.0, mask_feature_length=10, mask_feature_min_masks=0, - num_codevectors_per_group=320, - num_codevector_groups=2, - contrastive_logits_temperature=0.1, - num_negatives=100, - codevector_dim=256, - proj_codevector_dim=256, - diversity_loss_weight=0.1, ctc_loss_reduction="sum", ctc_zero_infinity=False, use_weighted_layer_sum=False, @@ -285,16 +259,6 @@ def __init__( self.mask_feature_length = mask_feature_length self.mask_feature_min_masks = mask_feature_min_masks - # parameters for pretraining with codevector quantized representations - self.num_codevectors_per_group = num_codevectors_per_group - self.num_codevector_groups = num_codevector_groups - self.contrastive_logits_temperature = contrastive_logits_temperature - self.feat_quantizer_dropout = feat_quantizer_dropout - self.num_negatives = num_negatives - self.codevector_dim = codevector_dim - self.proj_codevector_dim = proj_codevector_dim - self.diversity_loss_weight = diversity_loss_weight - # ctc loss self.ctc_loss_reduction = ctc_loss_reduction self.ctc_zero_infinity = ctc_zero_infinity diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py index 3c7d6765dd314..8cf165ced3db2 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -41,8 +41,8 @@ class Data2VecTextConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 30522): - Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + Vocabulary size of the DATA2VEC model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`Data2VecModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -62,7 +62,7 @@ class Data2VecTextConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + The vocabulary size of the `token_type_ids` passed when calling [`Data2VecModel`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 78e071a0064ef..b66430328aa66 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -260,42 +260,6 @@ def compute_num_masked_span(input_length): return spec_aug_mask -def _sample_negative_indices( - features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None -): - """ - Sample `num_negatives` vectors from feature vectors. - """ - batch_size, sequence_length = features_shape - - # generate indices of the positive vectors themselves, repeat them `num_negatives` times - sequence_length_range = np.arange(sequence_length) - - # get `num_negatives` random vector indices from the same utterance - sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) - - mask_time_indices = ( - mask_time_indices.astype(np.bool) if mask_time_indices is not None else np.ones(features_shape, dtype=np.bool) - ) - - for batch_idx in range(batch_size): - high = mask_time_indices[batch_idx].sum() - 1 - mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]] - - feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives)) - sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives)) - # avoid sampling the same positive vector, but keep the distribution uniform - sampled_indices[sampled_indices >= feature_indices] += 1 - - # remap to actual indices - sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices] - - # correct for batch size - sampled_negative_indices[batch_idx] += batch_idx * sequence_length - - return sampled_negative_indices - - class Data2VecAudioConvLayer(nn.Module): def __init__(self, config, layer_id=0): super().__init__() @@ -636,38 +600,6 @@ def forward(self, hidden_states, attention_mask=None, output_attentions=False): return outputs -class Data2VecAudioEncoderLayerStableLayerNorm(nn.Module): - def __init__(self, config): - super().__init__() - self.attention = Data2VecAudioAttention( - embed_dim=config.hidden_size, - num_heads=config.num_attention_heads, - dropout=config.attention_dropout, - is_decoder=False, - ) - self.dropout = nn.Dropout(config.hidden_dropout) - self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.feed_forward = Data2VecAudioFeedForward(config) - self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states, attention_mask=None, output_attentions=False): - attn_residual = hidden_states - hidden_states = self.layer_norm(hidden_states) - hidden_states, attn_weights, _ = self.attention( - hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) - hidden_states = self.dropout(hidden_states) - hidden_states = attn_residual + hidden_states - hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - class Data2VecAudioEncoder(nn.Module): def __init__(self, config): super().__init__() diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 720e41481ffd4..2aa6db6de3fba 100644 --- 
a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -593,7 +593,7 @@ class Data2VecTextPreTrainedModel(PreTrainedModel): """ config_class = Data2VecTextConfig - base_model_prefix = "data2vec" + base_model_prefix = "data2vec_text" supports_gradient_checkpointing = True def _init_weights(self, module): @@ -894,7 +894,7 @@ def __init__(self, config): if not config.is_decoder: logger.warning("If you want to use `Data2VecTextLMHeadModel` as a standalone, add `is_decoder=True.`") - self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecTextLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings @@ -975,7 +975,7 @@ def forward( if labels is not None: use_cache = False - outputs = self.data2vec( + outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1049,7 +1049,7 @@ def __init__(self, config): "bi-directional self-attention." ) - self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecTextLMHead(config) # The LM head weights require special treatment only when they are tied with the word embeddings @@ -1097,7 +1097,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.data2vec( + outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1173,7 +1173,7 @@ def __init__(self, config): self.num_labels = config.num_labels self.config = config - self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.classifier = Data2VecTextClassificationHead(config) # Initialize weights and apply final processing @@ -1207,7 +1207,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.data2vec( + outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1269,7 +1269,7 @@ class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel): def __init__(self, config): super().__init__(config) - self.data2vec = Data2VecTextModel(config) + self.data2vec_text = Data2VecTextModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) @@ -1317,7 +1317,7 @@ def forward( else None ) - outputs = self.data2vec( + outputs = self.data2vec_text( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1366,7 +1366,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) @@ -1402,7 +1402,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.data2vec( + outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1474,7 +1474,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels 
- self.data2vec = Data2VecTextModel(config, add_pooling_layer=False) + self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) # Initialize weights and apply final processing @@ -1513,7 +1513,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.data2vec( + outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, From 45fb62f7e2724b4cc58c911b8f76e72f7e1ab4b4 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 16:58:03 +0000 Subject: [PATCH 102/121] make all tests pass --- src/transformers/__init__.py | 2 + src/transformers/models/auto/__init__.py | 2 + ..._original_pytorch_checkpoint_to_pytorch.py | 19 +- .../data2vec/modeling_data2vec_audio.py | 6 +- .../models/data2vec/modeling_data2vec_text.py | 6 +- tests/test_modeling_common.py | 5 +- tests/test_modeling_data2vec.py | 351 ++---------------- 7 files changed, 45 insertions(+), 346 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 433769ca2f640..5b461ce4b6a2d 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -669,6 +669,7 @@ _import_structure["models.auto"].extend( [ "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING", + "MODEL_FOR_AUDIO_XVECTOR_MAPPING", "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING", "MODEL_FOR_CAUSAL_LM_MAPPING", "MODEL_FOR_CTC_MAPPING", @@ -2860,6 +2861,7 @@ ) from .models.auto import ( MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, + MODEL_FOR_AUDIO_XVECTOR_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CTC_MAPPING, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 18701a42757cf..9a0e73fc24017 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -32,6 +32,7 @@ if is_torch_available(): _import_structure["modeling_auto"] = [ "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING", + "MODEL_FOR_AUDIO_XVECTOR_MAPPING", "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING", "MODEL_FOR_CAUSAL_LM_MAPPING", "MODEL_FOR_CTC_MAPPING", @@ -150,6 +151,7 @@ if is_torch_available(): from .modeling_auto import ( MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, + MODEL_FOR_AUDIO_XVECTOR_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CTC_MAPPING, diff --git a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py index 5dff263d7ab55..8659e36d9f483 100644 --- a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py @@ -32,10 +32,11 @@ BertSelfAttention, BertSelfOutput, ) -from transformers.utils import logging +# IMPORTANT: In order for this script to run, please make sure to download the dictionary: `dict.txt` from wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz # File copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py -from .data2vec_text import Data2VecTextModel +from transformers.models.data2vec.data2vec_text import Data2VecTextModel +from transformers.utils import logging if version.parse(fairseq.__version__) < version.parse("0.9.0"): @@ -80,17 +81,17 @@ def 
convert_data2vec_checkpoint_to_pytorch( # Now let's copy all the weights. # Embeddings - model.data2vec.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight - model.data2vec.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight - model.data2vec.embeddings.token_type_embeddings.weight.data = torch.zeros_like( - model.data2vec.embeddings.token_type_embeddings.weight + model.data2vec_text.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight + model.data2vec_text.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight + model.data2vec_text.embeddings.token_type_embeddings.weight.data = torch.zeros_like( + model.data2vec_text.embeddings.token_type_embeddings.weight ) # just zero them out b/c data2vec doesn't use them. - model.data2vec.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight - model.data2vec.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias + model.data2vec_text.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight + model.data2vec_text.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias for i in range(config.num_hidden_layers): # Encoder: start of layer - layer: BertLayer = model.data2vec.encoder.layer[i] + layer: BertLayer = model.data2vec_text.encoder.layer[i] data2vec_layer: TransformerSentenceEncoderLayer = data2vec_sent_encoder.layers[i] # self attention diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index b66430328aa66..90b8dce841e1d 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -72,10 +72,10 @@ DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/data2vec-audio-base", + "facebook/data2vec-audio-base-10m", + "facebook/data2vec-audio-base-100h", "facebook/data2vec-audio-base-960h", - "facebook/data2vec-audio-large-960h", - "facebook/data2vec-audio-large-960h-lv60", - "facebook/data2vec-audio-large-960h-lv60-self", # See all Data2VecAudio models at https://huggingface.co/models?filter=data2vec-audio ] diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 2aa6db6de3fba..ecb867d637617 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -54,7 +54,7 @@ _HIDDEN_STATES_START_POSITION = 2 -_CHECKPOINT_FOR_DOC = "data2vec" +_CHECKPOINT_FOR_DOC = "facebook/data2vec-text-base" _CONFIG_FOR_DOC = "Data2VecTextConfig" _TOKENIZER_FOR_DOC = "RobertaTokenizer" @@ -67,8 +67,8 @@ _CTC_EXPECTED_LOSS = 53.48 DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "data2vec", - # See all data2vec models at https://huggingface.co/models?filter=data2vec + "facebook/data2vec-text-base", + # See all data2vec models at https://huggingface.co/models?filter=data2vec-text ] diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 21ee53a824b0d..6e2c9d1d679c5 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -69,6 +69,7 @@ from test_module.custom_modeling import CustomModel, NoSuperInitModel from transformers import ( BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MODEL_FOR_AUDIO_XVECTOR_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, 
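With `_CHECKPOINT_FOR_DOC` and the archive list above now pointing at `facebook/data2vec-text-base`, a minimal usage sketch of the renamed text model looks as follows. This is illustrative rather than part of the patch: it assumes the checkpoint is published under that name and, in line with `_TOKENIZER_FOR_DOC` and the dictionary note in the conversion script, that the RoBERTa tokenizer vocabulary is reused:

```python
import torch
from transformers import Data2VecTextModel, RobertaTokenizer

# Assumption: the renamed checkpoint is available on the Hub and shares the
# RoBERTa BPE vocabulary, as the conversion script above suggests.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base")
model.eval()

inputs = tokenizer("Hello world!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# e.g. torch.Size([1, 5, 768]) for a base-sized checkpoint
print(outputs.last_hidden_state.shape)
```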
@@ -137,6 +138,8 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): else v for k, v in inputs_dict.items() } + elif model_class in get_values(MODEL_FOR_AUDIO_XVECTOR_MAPPING): + inputs_dict.pop("attention_mask") if return_labels: if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): @@ -171,6 +174,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict["bool_masked_pos"] = torch.zeros( (self.model_tester.batch_size, num_patches**2), dtype=torch.long, device=torch_device ) + return inputs_dict def test_save_load(self): @@ -371,7 +375,6 @@ def test_initialization(self): def test_determinism(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: model = model_class(config) model.to(torch_device) diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py index 15ceb29ff9e0f..01dda64c5a1b1 100644 --- a/tests/test_modeling_data2vec.py +++ b/tests/test_modeling_data2vec.py @@ -52,19 +52,14 @@ Data2VecTextForSequenceClassification, Data2VecTextForTokenClassification, Data2VecTextModel, - Wav2Vec2FeatureExtractor, Wav2Vec2Processor, ) + from transformers.models.data2vec.modeling_data2vec_audio import _compute_mask_indices from transformers.models.data2vec.modeling_data2vec_text import ( DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, Data2VecTextForTextEmbeddings, create_position_ids_from_input_ids, ) - from transformers.models.wav2vec2.modeling_wav2vec2 import ( - Wav2Vec2GumbelVectorQuantizer, - _compute_mask_indices, - _sample_negative_indices, - ) class Data2VecTextModelTester: @@ -502,7 +497,7 @@ def test_create_position_ids_from_inputs_embeds(self): class Data2VecTextModelIntegrationTest(TestCasePlus): @slow def test_inference_masked_lm(self): - model = Data2VecTextForMaskedLM.from_pretrained("data2vec") + model = Data2VecTextForMaskedLM.from_pretrained("facebook/data2vec-text-base") input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) with torch.no_grad(): @@ -510,51 +505,24 @@ def test_inference_masked_lm(self): expected_shape = torch.Size((1, 11, 50265)) self.assertEqual(output.shape, expected_shape) # compare the actual values for a slice. - expected_slice = torch.tensor( - [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] - ) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.model.forward(input_ids)[0][:, :3, :3].detach() + expected_slice = torch.tensor([[[0.2328, 0.0000, 1.1710], [2.2525, 0.0000, 1.9937], [2.1280, 0.0000, 1.8691]]]) self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) @slow def test_inference_no_head(self): - model = Data2VecTextModel.from_pretrained("data2vec") + model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base") input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) with torch.no_grad(): output = model(input_ids)[0] # compare the actual values for a slice. 
expected_slice = torch.tensor( - [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] + [[[0.1998, -0.0379, 0.0024], [-0.0971, -0.2214, -0.1798], [-0.0789, -0.2400, -0.1898]]] ) - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.base') - # data2vec.eval() - # expected_slice = data2vec.extract_features(input_ids)[:, :3, :3].detach() - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - @slow - def test_inference_classification_head(self): - model = Data2VecTextForSequenceClassification.from_pretrained("data2vec-large-mnli") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 3)) - self.assertEqual(output.shape, expected_shape) - expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) - - # data2vec = torch.hub.load('pytorch/fairseq', 'data2vec.large.mnli') - # data2vec.eval() - # expected_tensor = data2vec.predict("mnli", input_ids, return_logits=True).detach() - - self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) - class Data2VecAudioModelTester: def __init__( @@ -874,7 +842,13 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( - (Data2VecAudioForCTC, Data2VecAudioModel, Data2VecAudioForSequenceClassification) + ( + Data2VecAudioForCTC, + Data2VecAudioModel, + Data2VecAudioForSequenceClassification, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForXVector, + ) if is_torch_available() else () ) @@ -1044,7 +1018,7 @@ def _mock_init_weights(self, module): def test_mask_feature_prob_ctc(self): model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_feature_prob=0.2, mask_feature_length=2 + "facebook/data2vec-audio-base-960h", mask_feature_prob=0.2, mask_feature_length=2 ) model.to(torch_device).train() processor = Wav2Vec2Processor.from_pretrained( @@ -1067,7 +1041,7 @@ def test_mask_feature_prob_ctc(self): def test_mask_time_prob_ctc(self): model = Data2VecAudioForCTC.from_pretrained( - "patrickvonplaten/data2vec-base-960h", mask_time_prob=0.2, mask_time_length=2 + "facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2 ) model.to(torch_device).train() processor = Wav2Vec2Processor.from_pretrained( @@ -1094,7 +1068,7 @@ def test_feed_forward_chunking(self): @slow def test_model_from_pretrained(self): - model = Data2VecAudioModel.from_pretrained("patrickvonplaten/data2vec-base") + model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base") self.assertIsNotNone(model) @@ -1190,81 +1164,6 @@ def test_compute_mask_indices_short_audio(self): # make sure that non-padded examples cannot be padded self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) - def test_compute_perplexity(self): - probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100 - - ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) - self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) - - # mask half of the input - mask = torch.ones((2,), device=torch_device, dtype=torch.bool) - mask[0] = 0 - - ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) - self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3) - - def test_sample_negatives(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - features = (torch.arange(sequence_length * hidden_size, 
device=torch_device) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() - - # sample negative indices - sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) - sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - - def test_sample_negatives_with_mask(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - # second half of last input tensor is padded - mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) - mask[-1, sequence_length // 2 :] = 0 - - features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() - - # replace masked feature vectors with -100 to test that those are not sampled - features = torch.where(mask[:, :, None].expand(features.shape).bool(), features, -100) - - # sample negative indices - sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask.cpu().numpy() - ) - sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - - self.assertTrue((negatives >= 0).all().item()) - - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - @require_torch @require_soundfile @@ -1285,7 +1184,7 @@ def _load_superb(self, task, num_samples): return ds[:num_samples] def test_inference_ctc_normal(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") + model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h") model.to(torch_device) processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) input_speech = self._load_datasamples(1) @@ -1301,31 +1200,8 @@ def test_inference_ctc_normal(self): EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) 
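Outside the test harness, the CTC inference flow exercised by `test_inference_ctc_normal` above boils down to the sketch below. It is a hedged illustration, not code from this patch: it assumes the `facebook/data2vec-audio-base-960h` checkpoint also bundles processor files, and it feeds a random waveform as a stand-in for real 16 kHz speech, so the decoded text will be meaningless:

```python
import numpy as np
import torch
from transformers import Data2VecAudioForCTC, Wav2Vec2Processor

# Assumption: the checkpoint ships tokenizer + feature-extractor files so the
# processor can be loaded from the same repository as the model.
processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-base-960h")
model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h")
model.eval()

# Placeholder input: one second of random noise at 16 kHz; a real use case
# would pass an actual speech waveform, e.g. the LibriSpeech samples above.
speech = np.random.randn(16_000).astype(np.float32)
inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    logits = model(inputs.input_values).logits  # (batch, frames, vocab_size)

predicted_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
print(processor.batch_decode(predicted_ids))
```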
- def test_inference_ctc_normal_batched(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h") - model.to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - - with torch.no_grad(): - logits = model(input_values).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_robust_batched(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) + def test_inference_ctc_batched(self): + model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h").to(torch_device) processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) input_speech = self._load_datasamples(4) @@ -1333,10 +1209,9 @@ def test_inference_ctc_robust_batched(self): inputs = processor(input_speech, return_tensors="pt", padding=True) input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) with torch.no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits + logits = model(input_values).logits predicted_ids = torch.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) @@ -1344,191 +1219,7 @@ def test_inference_ctc_robust_batched(self): EXPECTED_TRANSCRIPTIONS = [ "a man said to the universe sir i exist", "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", - "his instant panic was followed by a small sharp blow high on his chest", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about", + "his instant of panic was followed by a small sharp blow high on his chest", ] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_keyword_spotting(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") - input_data = self._load_superb("ks", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) - - expected_labels = [7, 6, 10, 9] - # s3prl logits for the same batch - expected_logits = torch.tensor([6.1186, 11.8961, 10.2931, 6.0898], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, 
atol=1e-2)) - - def test_inference_intent_classification(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("ic", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - - predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1) - predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1) - predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1) - - expected_labels_action = [0, 0, 2, 3] - expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device) - expected_labels_object = [3, 10, 3, 4] - expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device) - expected_labels_location = [0, 0, 0, 1] - expected_logits_location = torch.tensor([1.5335, 6.5096, 10.5704, 11.0569], device=torch_device) - - self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) - self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) - self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) - - self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) - self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) - self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) - - def test_inference_speaker_identification(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("si", 4) - - output_logits = [] - with torch.no_grad(): - for example in input_data["speech"]: - input = processor(example, return_tensors="pt", padding=True) - output = model(input.input_values.to(torch_device), attention_mask=None) - output_logits.append(output.logits[0]) - output_logits = torch.stack(output_logits) - predicted_logits, predicted_ids = torch.max(output_logits, dim=-1) - - expected_labels = [251, 1, 1, 3] - # s3prl logits for the same batch - expected_logits = torch.tensor([37.5627, 71.6362, 64.2419, 31.7778], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_inference_emotion_recognition(self): - model = Data2VecAudioForSequenceClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("er", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) - - expected_labels = [1, 1, 2, 2] - # s3prl logits for the same 
batch - expected_logits = torch.tensor([2.1722, 3.0779, 8.0287, 6.6797], device=torch_device) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_phoneme_recognition(self): - model = Data2VecAudioForCTC.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-ic") - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - - with torch.no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t", - "s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ oʊ n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ", - "ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː v ə n ð ə s ɔːɹ ɹ ɪ ŋ ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t", - "h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t", - ] - # should correspond to =>: - # [ - # "a man said to the universe sir i exist", - # "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - # "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", - # "his instant panic was followed by a small sharp blow high on his chest", - # ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_diarization(self): - model = Data2VecAudioForAudioFrameClassification.from_pretrained("patrickvonplaten/data2vec-base-960h").to( - torch_device - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("sd", 4) - inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) - - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - with torch.no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - # labels is a one-hot array of shape (num_frames, num_speakers) - labels = (outputs.logits > 0).long() - - # s3prl logits for the same batch - expected_logits = torch.tensor( - [ - [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], - [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], - [[-0.8656, -0.4783], [-0.8899, -0.3289], [-0.9267, -0.5781], [-0.7817, -0.4619]], - [[-4.8625, -2.5316], [-5.2339, -2.2155], [-4.9835, -2.0344], [-4.4727, -1.8421]], - ], - device=torch_device, - ) - self.assertEqual(labels[0, :, 0].sum(), 555) - self.assertEqual(labels[0, :, 1].sum(), 299) - # TODO: update the tolerance after the CI moves to torch 1.10 - self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) - - def test_inference_speaker_verification(self): - model = 
Data2VecAudioForXVector.from_pretrained("patrickvonplaten/data2vec-base-960h").to(torch_device) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("si", 4) - - inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000) - labels = torch.tensor([5, 1, 1, 3], device=torch_device).T - - with torch.no_grad(): - input_values = inputs.input_values.to(torch_device) - attention_mask = inputs.attention_mask.to(torch_device) - outputs = model(input_values, attention_mask=attention_mask, labels=labels) - embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1).cpu() - - cosine_sim = torch.nn.CosineSimilarity(dim=-1) - # id10002 vs id10002 - self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) - # id10006 vs id10002 - self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).numpy(), 0.7579, 3) - # id10002 vs id10004 - self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).numpy(), 0.7594, 3) - - # TODO: update the tolerance after the CI moves to torch 1.10 - self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) From 8cde36ac4b62a7f2b0db719ea91ca91f167ed979 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 17:12:01 +0000 Subject: [PATCH 103/121] finish --- docs/source/index.mdx | 3 +- docs/source/serialization.mdx | 2 +- src/transformers/__init__.py | 6 +- src/transformers/utils/dummy_pt_objects.py | 8 +- tests/test_modeling_data2vec_audio.py | 743 +++++++++++++++++++++ tests/test_modeling_data2vec_text.py | 506 ++++++++++++++ utils/check_repo.py | 12 + 7 files changed, 1275 insertions(+), 5 deletions(-) create mode 100644 tests/test_modeling_data2vec_audio.py create mode 100644 tests/test_modeling_data2vec_text.py diff --git a/docs/source/index.mdx b/docs/source/index.mdx index c4a42a4de4192..44a53a6e93774 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -181,7 +181,8 @@ Flax), PyTorch, and/or TensorFlow. 
| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNext | ❌ | ❌ | ✅ | ❌ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| Data2Vec | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/serialization.mdx b/docs/source/serialization.mdx index c77a77114e5f8..5eff7580d9b40 100644 --- a/docs/source/serialization.mdx +++ b/docs/source/serialization.mdx @@ -49,7 +49,7 @@ Ready-made configurations include the following architectures: - BART - BERT - CamemBERT -- Data2Vec +- Data2VecText - DistilBERT - ELECTRA - GPT Neo diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5b461ce4b6a2d..70e927f0a04ac 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -875,7 +875,8 @@ ) _import_structure["models.data2vec"].extend( [ - "DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST", + "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST", + "DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", "Data2VecAudioForAudioFrameClassification", "Data2VecAudioForCTC", "Data2VecAudioForSequenceClassification", @@ -3037,7 +3038,8 @@ CTRLPreTrainedModel, ) from .models.data2vec import ( - DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST, + DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST, + DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, Data2VecAudioForAudioFrameClassification, Data2VecAudioForCTC, Data2VecAudioForSequenceClassification, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 3bce039f60cfb..71587cba3bc9f 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -347,6 +347,9 @@ def load_tf_weights_in_albert(*args, **kwargs): MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = None +MODEL_FOR_AUDIO_XVECTOR_MAPPING = None + + MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING = None @@ -1211,7 +1214,10 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -DATA2VEC_PRETRAINED_MODEL_ARCHIVE_LIST = None +DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None class Data2VecAudioForAudioFrameClassification(metaclass=DummyObject): diff --git a/tests/test_modeling_data2vec_audio.py b/tests/test_modeling_data2vec_audio.py new file mode 100644 index 0000000000000..4797d5365bc21 --- /dev/null +++ b/tests/test_modeling_data2vec_audio.py @@ -0,0 +1,743 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Data2VecAudio model. 
""" + +import math +import unittest + +import numpy as np +from datasets import load_dataset + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import Data2VecAudioConfig, is_torch_available +from transformers.testing_utils import is_pt_flax_cross_test, require_soundfile, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, _config_zero_init + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Wav2Vec2Processor, + ) + from transformers.models.data2vec.modeling_data2vec_audio import _compute_mask_indices + + +class Data2VecAudioModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, # speech is longer + is_training=False, + hidden_size=16, + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=4, + num_attention_heads=2, + hidden_dropout_prob=0.1, + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + mask_time_prob=0.5, + mask_time_length=2, + vocab_size=32, + num_adapter_layers=1, + adapter_stride=2, + tdnn_dim=(32, 32), + tdnn_kernel=(5, 3), + tdnn_dilation=(1, 2), + xvector_output_dim=32, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.num_adapter_layers = num_adapter_layers + self.adapter_stride = adapter_stride + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.scope = scope + self.tdnn_dim = tdnn_dim + self.tdnn_kernel = tdnn_kernel + self.tdnn_dilation = tdnn_dilation + self.xvector_output_dim = xvector_output_dim + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = self.get_config() + + return config, input_values, attention_mask + + def get_config(self): + return Data2VecAudioConfig( + hidden_size=self.hidden_size, + 
feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + mask_time_prob=self.mask_time_prob, + mask_time_length=self.mask_time_length, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + layer_norm_eps=self.layer_norm_eps, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + vocab_size=self.vocab_size, + num_adapter_layers=self.num_adapter_layers, + adapter_stride=self.adapter_stride, + tdnn_dim=self.tdnn_dim, + tdnn_kernel=self.tdnn_kernel, + tdnn_dilation=self.tdnn_dilation, + xvector_output_dim=self.xvector_output_dim, + ) + + def create_and_check_model(self, config, input_values, attention_mask): + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter(self, config, input_values, attention_mask): + config.add_adapter = True + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): + config.add_adapter = True + config.output_hidden_size = 8 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), + ) + + def create_and_check_batch_inference(self, config, input_values, *args): + # test does not pass for models making use of `group_norm` + # check: https://github.com/pytorch/fairseq/issues/3227 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0.0 + + batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state + + for i in range(input_values.shape[0]): + input_slice = input_values[i : i + 1, : input_lengths[i]] + output = model(input_slice).last_hidden_state + + batch_output = batch_outputs[i : i + 1, : output.shape[1]] + self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) + + def check_ctc_loss(self, config, input_values, *args): + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + 
max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + model.config.ctc_loss_reduction = "sum" + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + model.config.ctc_loss_reduction = "mean" + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(sum_loss, float)) + self.parent.assertTrue(isinstance(mean_loss, float)) + + def check_seq_classifier_loss(self, config, input_values, *args): + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + unmasked_loss = model(input_values, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(masked_loss, float)) + self.parent.assertTrue(isinstance(unmasked_loss, float)) + self.parent.assertTrue(masked_loss != unmasked_loss) + + def check_ctc_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + model.train() + + # freeze feature encoder + model.freeze_feature_encoder() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + if max_length_labels[i] < labels.shape[-1]: + # it's important that we make sure that target lenghts are at least + # one shorter than logit lenghts to prevent -inf + labels[i, max_length_labels[i] - 1 :] = -100 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_seq_classifier_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_xvector_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForXVector(config=config) + model.to(torch_device) + model.train() + + # freeze 
everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_labels_out_of_vocab(self, config, input_values, *args): + model = Data2VecAudioForCTC(config) + model.to(torch_device) + model.train() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) + + with self.parent.assertRaises(ValueError): + model(input_values, labels=labels) + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + Data2VecAudioForCTC, + Data2VecAudioModel, + Data2VecAudioForSequenceClassification, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForXVector, + ) + if is_torch_available() + else () + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Data2VecAudioModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_adapter(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) + + def test_model_with_adapter_proj_dim(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_xvector_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_xvector_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + # Data2VecAudio has no inputs_embeds + def test_inputs_embeds(self): + pass + + # `input_ids` is renamed to 
`input_values` + def test_forward_signature(self): + pass + + # Data2VecAudio cannot resize token embeddings + # since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # Data2VecAudio has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_flax_to_pt(self): + pass + + @is_pt_flax_cross_test + # non-robust architecture does not exist in Flax + def test_equivalence_pt_to_flax(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + "project_hid.weight", + "project_hid.bias", + "project_q.weight", + "project_q.bias", + "feature_projection.projection.weight", + "feature_projection.projection.bias", + "objective.weight", + ] + if param.requires_grad: + if any([x in name for x in uniform_init_parms]): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and 
module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + def test_mask_feature_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "facebook/data2vec-audio-base-960h", mask_feature_prob=0.2, mask_feature_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + def test_mask_time_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base") + self.assertIsNotNone(model) + + +@require_torch +class Data2VecAudioUtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) + + def test_compute_mask_indices_low_prob(self): + # with these settings num_masked_spans=0.5, which means probabilistic rounding + # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in + # the other 5 out of 10, cases num_masked_spans=1 + n_trials = 100 + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + count_dimensions_masked = 0 + count_dimensions_not_masked = 0 + + for _ in range(n_trials): + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + num_masks = torch.sum(mask).item() + + if num_masks > 0: + count_dimensions_masked += 1 + else: + count_dimensions_not_masked += 1 + + # as we test for at least 10 masked dimension and at least + # 10 non-masked dimension, this test could fail with probability: + # P(100 coin flips, at most 9 heads) = 1.66e-18 + self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) + self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = 
_compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + def test_compute_mask_indices_attn_mask_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + attention_mask[:2, sequence_length // 2 :] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask + ) + mask = torch.from_numpy(mask).to(torch_device) + + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) + + def test_compute_mask_indices_short_audio(self): + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + # force one example to be heavily padded + attention_mask[0, 5:] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 + ) + + # make sure that non-padded examples cannot be padded + self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) + + +@require_torch +@require_soundfile +@slow +class Data2VecAudioModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def _load_superb(self, task, num_samples): + ds = load_dataset("anton-l/superb_dummy", task, split="test") + + return ds[:num_samples] + + def test_inference_ctc_normal(self): + model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + input_speech = self._load_datasamples(1) + + input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_batched(self): + model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i 
exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about", + "his instant of panic was followed by a small sharp blow high on his chest", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) diff --git a/tests/test_modeling_data2vec_text.py b/tests/test_modeling_data2vec_text.py new file mode 100644 index 0000000000000..45dba500601b4 --- /dev/null +++ b/tests/test_modeling_data2vec_text.py @@ -0,0 +1,506 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Data2VecAudio model. """ + +import unittest + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import Data2VecTextConfig, is_torch_available +from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextModel, + ) + from transformers.models.data2vec.modeling_data2vec_text import ( + DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, + Data2VecTextForTextEmbeddings, + create_position_ids_from_input_ids, + ) + + +class Data2VecTextModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = 
ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return Data2VecTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = Data2VecTextModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = Data2VecTextForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + 
token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        encoder_hidden_states,
+        encoder_attention_mask,
+    ):
+        config.is_decoder = True
+        config.add_cross_attention = True
+        model = Data2VecTextForCausalLM(config=config).to(torch_device).eval()
+
+        # make sure that ids don't start with pad token
+        mask = input_ids.ne(config.pad_token_id).long()
+        input_ids = input_ids * mask
+
+        # first forward pass
+        outputs = model(
+            input_ids,
+            attention_mask=input_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=True,
+        )
+        past_key_values = outputs.past_key_values
+
+        # create hypothetical multiple next tokens and extend to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+
+        # make sure that ids don't start with pad token
+        mask = next_tokens.ne(config.pad_token_id).long()
+        next_tokens = next_tokens * mask
+        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
+
+        # append to next input_ids and attention_mask
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
+
+        output_from_no_past = model(
+            next_input_ids,
+            attention_mask=next_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+        output_from_past = model(
+            next_tokens,
+            attention_mask=next_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def create_and_check_for_masked_lm(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = Data2VecTextForMaskedLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+    def create_and_check_for_token_classification(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = Data2VecTextForTokenClassification(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
+
+    def create_and_check_for_multiple_choice(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_choices = self.num_choices
+        model = Data2VecTextForMultipleChoice(config=config)
+        model.to(torch_device)
+        model.eval()
+        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_token_type_ids =
token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class Data2VecTextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextModel, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = Data2VecTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def 
test_decoder_model_past_with_large_inputs(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
+        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
+
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
+
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
+
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
+
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = Data2VecTextModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+
+    def test_create_position_ids_respects_padding_index(self):
+        """Ensure that the default position ids are only assigned sequentially to non-padding tokens. This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+
+        The position ids should be masked with the embedding object's padding index. Therefore, the
+        first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1
+        """
+        config = self.model_tester.prepare_config_and_inputs()[0]
+        model = Data2VecTextForTextEmbeddings(config=config)
+
+        input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
+        expected_positions = torch.as_tensor(
+            [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]]
+        )
+
+        position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx)
+        self.assertEqual(position_ids.shape, expected_positions.shape)
+        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
+
+    def test_create_position_ids_from_inputs_embeds(self):
+        """Ensure that the default position ids are only assigned sequentially to non-padding tokens. This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+
+        The position ids should be masked with the embedding object's padding index.
Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = Data2VecTextForTextEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + +@require_torch +class Data2VecTextModelIntegrationTest(TestCasePlus): + @slow + def test_inference_masked_lm(self): + model = Data2VecTextForMaskedLM.from_pretrained("facebook/data2vec-text-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor([[[0.2328, 0.0000, 1.1710], [2.2525, 0.0000, 1.9937], [2.1280, 0.0000, 1.8691]]]) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_no_head(self): + model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[0.1998, -0.0379, 0.0024], [-0.0971, -0.2214, -0.1798], [-0.0789, -0.2400, -0.1898]]] + ) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/utils/check_repo.py b/utils/check_repo.py index 6ea2b0a839791..b9186a0c64dba 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -18,6 +18,7 @@ import os import re import warnings +from collections import OrderedDict from difflib import get_close_matches from pathlib import Path @@ -171,6 +172,16 @@ "Data2VecAudioModel", # TODO: Fix ] +# Update this list for models that have multiple model types for the same +# model doc +MODEL_TYPE_TO_DOC_MAPPING = OrderedDict( + [ + ("data2vec-text", "data2vec"), + ("data2vec-audio", "data2vec"), + ] +) + + # This is to make sure the transformers module imported is the one in the repo. 
spec = importlib.util.spec_from_file_location( "transformers", @@ -640,6 +651,7 @@ def check_model_type_doc_match(): model_docs = [m.stem for m in model_doc_folder.glob("*.mdx")] model_types = list(transformers.models.auto.configuration_auto.MODEL_NAMES_MAPPING.keys()) + model_types = [MODEL_TYPE_TO_DOC_MAPPING[m] if m in MODEL_TYPE_TO_DOC_MAPPING else m for m in model_types] errors = [] for m in model_docs: From c6a49e91f0c93676444bf9d27309530b99dfa228 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 23 Feb 2022 17:24:58 +0000 Subject: [PATCH 104/121] remove useless test file --- tests/test_modeling_data2vec.py | 1225 ------------------------------- 1 file changed, 1225 deletions(-) delete mode 100644 tests/test_modeling_data2vec.py diff --git a/tests/test_modeling_data2vec.py b/tests/test_modeling_data2vec.py deleted file mode 100644 index 01dda64c5a1b1..0000000000000 --- a/tests/test_modeling_data2vec.py +++ /dev/null @@ -1,1225 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Data2VecAudio model. """ - -import math -import unittest - -import numpy as np -from datasets import load_dataset - -from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask -from transformers import Data2VecAudioConfig, Data2VecTextConfig, is_torch_available -from transformers.testing_utils import ( - TestCasePlus, - is_pt_flax_cross_test, - require_soundfile, - require_torch, - slow, - torch_device, -) - -from .test_configuration_common import ConfigTester -from .test_generation_utils import GenerationTesterMixin -from .test_modeling_common import ModelTesterMixin, _config_zero_init - - -if is_torch_available(): - import torch - - from transformers import ( - Data2VecAudioForAudioFrameClassification, - Data2VecAudioForCTC, - Data2VecAudioForSequenceClassification, - Data2VecAudioForXVector, - Data2VecAudioModel, - Data2VecTextForCausalLM, - Data2VecTextForMaskedLM, - Data2VecTextForMultipleChoice, - Data2VecTextForQuestionAnswering, - Data2VecTextForSequenceClassification, - Data2VecTextForTokenClassification, - Data2VecTextModel, - Wav2Vec2Processor, - ) - from transformers.models.data2vec.modeling_data2vec_audio import _compute_mask_indices - from transformers.models.data2vec.modeling_data2vec_text import ( - DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, - Data2VecTextForTextEmbeddings, - create_position_ids_from_input_ids, - ) - - -class Data2VecTextModelTester: - def __init__( - self, - parent, - ): - self.parent = parent - self.batch_size = 13 - self.seq_length = 7 - self.is_training = True - self.use_input_mask = True - self.use_token_type_ids = True - self.use_labels = True - self.vocab_size = 99 - self.hidden_size = 32 - self.num_hidden_layers = 5 - self.num_attention_heads = 4 - self.intermediate_size = 37 - self.hidden_act = "gelu" - self.hidden_dropout_prob = 0.1 - self.attention_probs_dropout_prob = 0.1 - self.max_position_embeddings = 512 - 
self.type_vocab_size = 16 - self.type_sequence_label_size = 2 - self.initializer_range = 0.02 - self.num_labels = 3 - self.num_choices = 4 - self.scope = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return Data2VecTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Data2VecTextModel(config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 
self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Data2VecTextForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Data2VecTextForCausalLM(config=config).to(torch_device).eval() - - # make sure that ids don't start with pad token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = 
Data2VecTextForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = Data2VecTextForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Data2VecTextForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class Data2VecTextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Data2VecTextForCausalLM, - Data2VecTextForMaskedLM, - Data2VecTextModel, - Data2VecTextForSequenceClassification, - Data2VecTextForTokenClassification, - Data2VecTextForMultipleChoice, - Data2VecTextForQuestionAnswering, - ) - if is_torch_available() - else () - ) - all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else () - - def setUp(self): - self.model_tester = Data2VecTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # 
This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = Data2VecTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_create_position_ids_respects_padding_index(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = Data2VecTextForTextEmbeddings(config=config) - - input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = torch.as_tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. 
Therefore, the - first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = Data2VecTextForTextEmbeddings(config=config) - - inputs_embeds = torch.empty(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - - -@require_torch -class Data2VecTextModelIntegrationTest(TestCasePlus): - @slow - def test_inference_masked_lm(self): - model = Data2VecTextForMaskedLM.from_pretrained("facebook/data2vec-text-base") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - expected_shape = torch.Size((1, 11, 50265)) - self.assertEqual(output.shape, expected_shape) - # compare the actual values for a slice. - expected_slice = torch.tensor([[[0.2328, 0.0000, 1.1710], [2.2525, 0.0000, 1.9937], [2.1280, 0.0000, 1.8691]]]) - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_no_head(self): - model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - with torch.no_grad(): - output = model(input_ids)[0] - # compare the actual values for a slice. - expected_slice = torch.tensor( - [[[0.1998, -0.0379, 0.0024], [-0.0971, -0.2214, -0.1798], [-0.0789, -0.2400, -0.1898]]] - ) - - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - -class Data2VecAudioModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=4, - num_attention_heads=2, - hidden_dropout_prob=0.1, - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - mask_time_prob=0.5, - mask_time_length=2, - vocab_size=32, - num_adapter_layers=1, - adapter_stride=2, - tdnn_dim=(32, 32), - tdnn_kernel=(5, 3), - tdnn_dilation=(1, 2), - xvector_output_dim=32, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - 
self.vocab_size = vocab_size - self.num_adapter_layers = num_adapter_layers - self.adapter_stride = adapter_stride - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.scope = scope - self.tdnn_dim = tdnn_dim - self.tdnn_kernel = tdnn_kernel - self.tdnn_dilation = tdnn_dilation - self.xvector_output_dim = xvector_output_dim - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): - return Data2VecAudioConfig( - hidden_size=self.hidden_size, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - mask_time_prob=self.mask_time_prob, - mask_time_length=self.mask_time_length, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - num_adapter_layers=self.num_adapter_layers, - adapter_stride=self.adapter_stride, - tdnn_dim=self.tdnn_dim, - tdnn_kernel=self.tdnn_kernel, - tdnn_dilation=self.tdnn_dilation, - xvector_output_dim=self.xvector_output_dim, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter(self, config, input_values, attention_mask): - config.add_adapter = True - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): - config.add_adapter = True - config.output_hidden_size = 8 - model = Data2VecAudioModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = Data2VecAudioModel(config=config) - model.to(torch_device) - 
model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = Data2VecAudioForCTC(config=config) - model.to(torch_device) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = Data2VecAudioForSequenceClassification(config=config) - model.to(torch_device) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForCTC(config=config) - model.to(torch_device) - model.train() - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lenghts are at least - # one 
shorter than logit lenghts to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForSequenceClassification(config=config) - model.to(torch_device) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_xvector_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForXVector(config=config) - model.to(torch_device) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(torch.isinf(loss).item()) - - loss.backward() - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = Data2VecAudioForCTC(config) - model.to(torch_device) - model.train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) - - with self.parent.assertRaises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_torch -class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Data2VecAudioForCTC, - Data2VecAudioModel, - Data2VecAudioForSequenceClassification, - Data2VecAudioForAudioFrameClassification, - Data2VecAudioForXVector, - ) - if is_torch_available() - else () - ) - test_pruning = False - test_headmasking = False - test_torchscript = False - - def setUp(self): - self.model_tester = Data2VecAudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - def 
test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # Data2VecAudio has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to `input_values` - def test_forward_signature(self): - pass - - # Data2VecAudio cannot resize token embeddings - # since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # Data2VecAudio has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented - def test_model_common_attributes(self): - pass - - @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_flax_to_pt(self): - pass - - @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_pt_to_flax(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - model.to(torch_device) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - - input_lengths = torch.tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - 
"feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any([x in name for x in uniform_init_parms]): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def test_mask_feature_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "facebook/data2vec-audio-base-960h", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - def test_mask_time_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2 - ) - model.to(torch_device).train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" - ) - - logits = model( - input_values=batch["input_values"].to(torch_device), - attention_mask=batch["attention_mask"].to(torch_device), - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base") - self.assertIsNotNone(model) - - -@require_torch -class Data2VecAudioUtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) - - def 
test_compute_mask_indices_low_prob(self): - # with these settings num_masked_spans=0.5, which means probabilistic rounding - # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in - # the other 5 out of 10, cases num_masked_spans=1 - n_trials = 100 - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - count_dimensions_masked = 0 - count_dimensions_not_masked = 0 - - for _ in range(n_trials): - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - num_masks = torch.sum(mask).item() - - if num_masks > 0: - count_dimensions_masked += 1 - else: - count_dimensions_not_masked += 1 - - # as we test for at least 10 masked dimension and at least - # 10 non-masked dimension, this test could fail with probability: - # P(100 coin flips, at most 9 heads) = 1.66e-18 - self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) - self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = torch.from_numpy(mask).to(torch_device) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - def test_compute_mask_indices_attn_mask_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) - attention_mask[:2, sequence_length // 2 :] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask - ) - mask = torch.from_numpy(mask).to(torch_device) - - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) - - def test_compute_mask_indices_short_audio(self): - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) - # force one example to be heavily padded - attention_mask[0, 5:] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 - ) - - # make sure that non-padded examples cannot be padded - self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) - - -@require_torch -@require_soundfile -@slow -class Data2VecAudioModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test") - - return ds[:num_samples] - - def test_inference_ctc_normal(self): - model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h") - model.to(torch_device) - processor = 
Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - input_speech = self._load_datasamples(1) - - input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) - - with torch.no_grad(): - logits = model(input_values).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_batched(self): - model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h").to(torch_device) - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="pt", padding=True) - - input_values = inputs.input_values.to(torch_device) - - with torch.no_grad(): - logits = model(input_values).logits - - predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about", - "his instant of panic was followed by a small sharp blow high on his chest", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) From b9266612fbdc61796701e4c59e5baf10e9242d44 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 24 Feb 2022 00:01:46 +0100 Subject: [PATCH 105/121] Update tests/test_modeling_common.py --- tests/test_modeling_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 6e2c9d1d679c5..889cc95c37429 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1914,7 +1914,6 @@ def test_load_with_mismatched_shapes(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - print(model_class) if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): continue From 432e42dbeb679c6b082fae3db6cf2a11013265ef Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 23 Feb 2022 15:24:10 -0800 Subject: [PATCH 106/121] Update utils/check_repo.py Co-authored-by: Patrick von Platen --- utils/check_repo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/check_repo.py b/utils/check_repo.py index b9186a0c64dba..9d21dd1072f3b 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -169,7 +169,6 @@ "VisualBertForMultipleChoice", "TFWav2Vec2ForCTC", "TFHubertForCTC", - "Data2VecAudioModel", # TODO: Fix ] # Update this list for models that have multiple model types for the same From a3ce0254342801fa564da5a9c6c73480ce64e6e2 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 23 Feb 2022 15:24:54 -0800 Subject: [PATCH 107/121] Update src/transformers/models/data2vec/modeling_data2vec_text.py Co-authored-by: Patrick von Platen --- src/transformers/models/data2vec/modeling_data2vec_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py 
b/src/transformers/models/data2vec/modeling_data2vec_text.py index ecb867d637617..6e8cd7d82078d 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -961,7 +961,7 @@ def forward( >>> from transformers import Data2VecTextTokenizer, Data2VecTextForCausalLM, Data2VecTextConfig >>> import torch - >>> tokenizer = Data2VecTextTokenizer.from_pretrained("data2vec-base") + >>> tokenizer = Data2VecTextTokenizer.from_pretrained("facebook/data2vec-text-base") >>> config = Data2VecTextConfig.from_pretrained("data2vec-base") >>> config.is_decoder = True >>> model = Data2VecTextForCausalLM.from_pretrained("data2vec-base", config=config) From 0780b0385d30c7a8503c4085a140280854526bd1 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 23 Feb 2022 20:53:32 -0500 Subject: [PATCH 108/121] Move data2vec tests to new structure --- tests/data2vec/__init__.py | 0 tests/{ => data2vec}/test_modeling_data2vec_audio.py | 4 ++-- tests/{ => data2vec}/test_modeling_data2vec_text.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) create mode 100644 tests/data2vec/__init__.py rename tests/{ => data2vec}/test_modeling_data2vec_audio.py (99%) rename tests/{ => data2vec}/test_modeling_data2vec_text.py (99%) diff --git a/tests/data2vec/__init__.py b/tests/data2vec/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/test_modeling_data2vec_audio.py b/tests/data2vec/test_modeling_data2vec_audio.py similarity index 99% rename from tests/test_modeling_data2vec_audio.py rename to tests/data2vec/test_modeling_data2vec_audio.py index 4797d5365bc21..e5b8fd0e3a9db 100644 --- a/tests/test_modeling_data2vec_audio.py +++ b/tests/data2vec/test_modeling_data2vec_audio.py @@ -24,8 +24,8 @@ from transformers import Data2VecAudioConfig, is_torch_available from transformers.testing_utils import is_pt_flax_cross_test, require_soundfile, require_torch, slow, torch_device -from .test_configuration_common import ConfigTester -from .test_modeling_common import ModelTesterMixin, _config_zero_init +from ..test_configuration_common import ConfigTester +from ..test_modeling_common import ModelTesterMixin, _config_zero_init if is_torch_available(): diff --git a/tests/test_modeling_data2vec_text.py b/tests/data2vec/test_modeling_data2vec_text.py similarity index 99% rename from tests/test_modeling_data2vec_text.py rename to tests/data2vec/test_modeling_data2vec_text.py index 45dba500601b4..cb32578ed2f16 100644 --- a/tests/test_modeling_data2vec_text.py +++ b/tests/data2vec/test_modeling_data2vec_text.py @@ -20,9 +20,9 @@ from transformers import Data2VecTextConfig, is_torch_available from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device -from .test_configuration_common import ConfigTester -from .test_generation_utils import GenerationTesterMixin -from .test_modeling_common import ModelTesterMixin +from ..test_configuration_common import ConfigTester +from ..generation.test_generation_utils import GenerationTesterMixin +from ..test_modeling_common import ModelTesterMixin if is_torch_available(): From de7f64974b24c251fbf6890830ea61a7c3dae161 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 23 Feb 2022 21:03:49 -0500 Subject: [PATCH 109/121] Fix test imports for text tests --- tests/data2vec/test_modeling_data2vec_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data2vec/test_modeling_data2vec_text.py 
b/tests/data2vec/test_modeling_data2vec_text.py index cb32578ed2f16..8b27cefb74bdb 100644 --- a/tests/data2vec/test_modeling_data2vec_text.py +++ b/tests/data2vec/test_modeling_data2vec_text.py @@ -20,8 +20,8 @@ from transformers import Data2VecTextConfig, is_torch_available from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device -from ..test_configuration_common import ConfigTester from ..generation.test_generation_utils import GenerationTesterMixin +from ..test_configuration_common import ConfigTester from ..test_modeling_common import ModelTesterMixin From f095e35aa5b262d1314b8a578c3bf4f49119a21f Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 23 Feb 2022 21:08:19 -0500 Subject: [PATCH 110/121] Remove fairseq files --- .../models/data2vec/data2vec_audio.py | 480 ------------------ .../models/data2vec/data2vec_text.py | 474 ----------------- .../models/data2vec/ema_module.py | 133 ----- 3 files changed, 1087 deletions(-) delete mode 100644 src/transformers/models/data2vec/data2vec_audio.py delete mode 100644 src/transformers/models/data2vec/data2vec_text.py delete mode 100644 src/transformers/models/data2vec/ema_module.py diff --git a/src/transformers/models/data2vec/data2vec_audio.py b/src/transformers/models/data2vec/data2vec_audio.py deleted file mode 100644 index 1c290d209d31e..0000000000000 --- a/src/transformers/models/data2vec/data2vec_audio.py +++ /dev/null @@ -1,480 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# TODO: Remove file - -import logging -import math -from dataclasses import dataclass, field -from typing import Optional - -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F -from fairseq.data.data_utils import compute_mask_indices -from fairseq.models import BaseFairseqModel, register_model -from fairseq.models.wav2vec import ConvFeatureExtractionModel, TransformerEncoder, Wav2Vec2Config -from fairseq.modules import GradMultiply, LayerNorm -from fairseq.utils import index_put - -from omegaconf import II - -# Copied from https://github.com/pytorch/fairseq/blob/327cff24a57c2ae06657731bf3be86ee88fccfea/fairseq/modules/ema_module.py -from .ema_module import EMAModule, EMAModuleConfig - - -logger = logging.getLogger(__name__) - - -@dataclass -class Data2VecAudioConfig(Wav2Vec2Config): - - loss_beta: float = field(default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}) - loss_scale: Optional[float] = field( - default=None, - metadata={"help": "scale the reconstruction loss by this constant. 
if None then scales by 1/sqrt(dim)"}, - ) - average_top_k_layers: int = field(default=8, metadata={"help": "how many layers to average"}) - - layer_norm_target_layer: bool = False - instance_norm_target_layer: bool = False - instance_norm_targets: bool = False - layer_norm_targets: bool = False - batch_norm_target_layer: bool = False - group_norm_target_layer: bool = False - - ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) - ema_end_decay: float = field(default=0.9999, metadata={"help": "final ema decay rate"}) - - # when to finish annealing ema decay rate - ema_anneal_end_step: int = II("optimization.max_update") - - ema_transformer_only: bool = field( - default=True, - metadata={"help": "whether to momentum update only the transformer"}, - ) - ema_layers_only: bool = field( - default=True, - metadata={"help": "whether to momentum update only the transformer layers"}, - ) - - max_update: int = II("optimization.max_update") - - min_target_var: float = field(default=0.1, metadata={"help": "stop training if target var falls below this"}) - min_pred_var: float = field( - default=0.01, - metadata={"help": "stop training if prediction var falls below this"}, - ) - - -def get_annealed_rate(start, end, curr_step, total_steps): - r = end - start - pct_remaining = 1 - curr_step / total_steps - return end - r * pct_remaining - - -@register_model("data2vec_audio", dataclass=Data2VecAudioConfig) -class Data2VecAudioModel(BaseFairseqModel): - def __init__(self, cfg: Data2VecAudioConfig): - super().__init__() - self.cfg = cfg - - feature_enc_layers = eval(cfg.conv_feature_layers) - self.extractor_embed = feature_enc_layers[-1][0] - - self.ema = None - self.embed = cfg.encoder_embed_dim - - self.average_top_k_layers = cfg.average_top_k_layers - self.loss_beta = cfg.loss_beta - self.loss_scale = cfg.loss_scale - - self.feature_extractor = ConvFeatureExtractionModel( - conv_layers=feature_enc_layers, - dropout=0.0, - mode=cfg.extractor_mode, - conv_bias=cfg.conv_bias, - ) - - self.post_extract_proj = nn.Linear(self.extractor_embed, cfg.encoder_embed_dim) - - self.mask_prob = cfg.mask_prob - self.mask_selection = cfg.mask_selection - self.mask_other = cfg.mask_other - self.mask_length = cfg.mask_length - self.no_mask_overlap = cfg.no_mask_overlap - self.mask_min_space = cfg.mask_min_space - - self.mask_channel_prob = cfg.mask_channel_prob - self.mask_channel_before = cfg.mask_channel_before - self.mask_channel_selection = cfg.mask_channel_selection - self.mask_channel_other = cfg.mask_channel_other - self.mask_channel_length = cfg.mask_channel_length - self.no_mask_channel_overlap = cfg.no_mask_channel_overlap - self.mask_channel_min_space = cfg.mask_channel_min_space - - self.dropout_input = nn.Dropout(cfg.dropout_input) - self.dropout_features = nn.Dropout(cfg.dropout_features) - - self.feature_grad_mult = cfg.feature_grad_mult - - self.mask_emb = nn.Parameter(torch.FloatTensor(cfg.encoder_embed_dim).uniform_()) - - self.encoder = TransformerEncoder(cfg) - self.layer_norm = LayerNorm(self.extractor_embed) - - self.final_proj = nn.Linear(self.embed, self.embed) - - self.num_updates = 0 - - def make_ema_teacher(self): - ema_config = EMAModuleConfig( - ema_decay=self.cfg.ema_decay, - ema_fp32=True, - ) - skip_keys = set() - if self.cfg.ema_layers_only: - self.cfg.ema_transformer_only = True - for k, _ in self.encoder.pos_conv.named_parameters(): - skip_keys.add(f"pos_conv.{k}") - - self.ema = EMAModule( - self.encoder if self.cfg.ema_transformer_only else self, - 
ema_config, - skip_keys=skip_keys, - ) - - def set_num_updates(self, num_updates): - super().set_num_updates(num_updates) - - if self.ema is None and self.final_proj is not None: - logger.info("making ema teacher") - self.make_ema_teacher() - elif self.training and self.ema is not None: - if self.cfg.ema_decay != self.cfg.ema_end_decay: - if num_updates >= self.cfg.ema_anneal_end_step: - decay = self.cfg.ema_end_decay - else: - decay = get_annealed_rate( - self.cfg.ema_decay, - self.cfg.ema_end_decay, - num_updates, - self.cfg.ema_anneal_end_step, - ) - self.ema.set_decay(decay) - if self.ema.get_decay() < 1: - self.ema.step(self.encoder if self.cfg.ema_transformer_only else self) - - self.num_updates = num_updates - - def state_dict(self, destination=None, prefix="", keep_vars=False): - state = super().state_dict(destination, prefix, keep_vars) - - if self.ema is not None: - state[prefix + "_ema"] = self.ema.fp32_params - - return state - - def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): - if self.ema is not None: - k = prefix + "_ema" - assert k in state_dict - self.ema.restore(state_dict[k], True) - del state_dict[k] - return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) - - @classmethod - def build_model(cls, cfg: Data2VecAudioConfig, task=None): - """Build a new model instance.""" - - return cls(cfg) - - def apply_mask( - self, - x, - padding_mask, - mask_indices=None, - mask_channel_indices=None, - ): - B, T, C = x.shape - - if self.mask_channel_prob > 0 and self.mask_channel_before: - mask_channel_indices = compute_mask_indices( - (B, C), - None, - self.mask_channel_prob, - self.mask_channel_length, - self.mask_channel_selection, - self.mask_channel_other, - no_overlap=self.no_mask_channel_overlap, - min_space=self.mask_channel_min_space, - ) - mask_channel_indices = torch.from_numpy(mask_channel_indices).to(x.device).unsqueeze(1).expand(-1, T, -1) - x[mask_channel_indices] = 0 - - if self.mask_prob > 0: - if mask_indices is None: - mask_indices = compute_mask_indices( - (B, T), - padding_mask, - self.mask_prob, - self.mask_length, - self.mask_selection, - self.mask_other, - min_masks=1, - no_overlap=self.no_mask_overlap, - min_space=self.mask_min_space, - require_same_masks=self.cfg.require_same_masks, - mask_dropout=self.cfg.mask_dropout, - ) - mask_indices = torch.from_numpy(mask_indices).to(x.device) - x = index_put(x, mask_indices, self.mask_emb) - else: - mask_indices = None - - if self.mask_channel_prob > 0 and not self.mask_channel_before: - if mask_channel_indices is None: - mask_channel_indices = compute_mask_indices( - (B, C), - None, - self.mask_channel_prob, - self.mask_channel_length, - self.mask_channel_selection, - self.mask_channel_other, - no_overlap=self.no_mask_channel_overlap, - min_space=self.mask_channel_min_space, - ) - mask_channel_indices = ( - torch.from_numpy(mask_channel_indices).to(x.device).unsqueeze(1).expand(-1, T, -1) - ) - x = index_put(x, mask_channel_indices, 0) - - return x, mask_indices - - def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): - """ - Computes the output length of the convolutional layers - """ - - def _conv_out_length(input_length, kernel_size, stride): - return torch.floor((input_length - kernel_size) / stride + 1) - - conv_cfg_list = eval(self.cfg.conv_feature_layers) - - for i in range(len(conv_cfg_list)): - input_lengths = _conv_out_length(input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2]) - - return input_lengths.to(torch.long) - - def forward( - 
self, - source, - padding_mask=None, - mask=True, - features_only=False, - layer=None, - mask_indices=None, - mask_channel_indices=None, - padding_count=None, - ): - features = source - - if self.feature_grad_mult > 0: - features = self.feature_extractor(features) - if self.feature_grad_mult != 1.0: - features = GradMultiply.apply(features, self.feature_grad_mult) - else: - with torch.no_grad(): - features = self.feature_extractor(features) - - features = features.transpose(1, 2) - - features = self.layer_norm(features) - - orig_padding_mask = padding_mask - - if padding_mask is not None and padding_mask.any(): - input_lengths = (1 - padding_mask.long()).sum(-1) - # apply conv formula to get real output_lengths - output_lengths = self._get_feat_extract_output_lengths(input_lengths) - - padding_mask = torch.zeros(features.shape[:2], dtype=features.dtype, device=features.device) - - # these two operations makes sure that all values - # before the output lengths indices are attended to - padding_mask[ - ( - torch.arange(padding_mask.shape[0], device=padding_mask.device), - output_lengths - 1, - ) - ] = 1 - padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() - else: - padding_mask = None - - if self.post_extract_proj is not None: - features = self.post_extract_proj(features) - - pre_encoder_features = None - if self.cfg.ema_transformer_only: - pre_encoder_features = features.clone() - - features = self.dropout_input(features) - - if mask: - x, mask_indices = self.apply_mask( - features, - padding_mask, - mask_indices=mask_indices, - mask_channel_indices=mask_channel_indices, - ) - else: - x = features - mask_indices = None - x, layer_results = self.encoder( - x, - padding_mask=padding_mask, - layer=layer, - ) - - if features_only: - return { - "x": x, - "padding_mask": padding_mask, - "layer_results": layer_results, - } - - result = { - "losses": {}, - } - - with torch.no_grad(): - self.ema.model.eval() - - if self.cfg.ema_transformer_only: - y, layer_results = self.ema.model.extract_features( - pre_encoder_features, - padding_mask=padding_mask, - min_layer=self.cfg.encoder_layers - self.average_top_k_layers, - ) - y = { - "x": y, - "padding_mask": padding_mask, - "layer_results": layer_results, - } - else: - y = self.ema.model.extract_features( - source=source, - padding_mask=orig_padding_mask, - mask=False, - ) - - target_layer_results = [l[2] for l in y["layer_results"]] - - permuted = False - if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: - target_layer_results = [tl.permute(1, 2, 0) for tl in target_layer_results] # TBC -> BCT - permuted = True - - if self.cfg.batch_norm_target_layer: - target_layer_results = [ - F.batch_norm(tl.float(), running_mean=None, running_var=None, training=True) - for tl in target_layer_results - ] - - if self.cfg.instance_norm_target_layer: - target_layer_results = [F.instance_norm(tl.float()) for tl in target_layer_results] - - if permuted: - target_layer_results = [tl.transpose(1, 2) for tl in target_layer_results] # BCT -> BTC - - if self.cfg.group_norm_target_layer: - target_layer_results = [F.layer_norm(tl.float(), tl.shape[-2:]) for tl in target_layer_results] - - if self.cfg.layer_norm_target_layer: - target_layer_results = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in target_layer_results] - - y = sum(target_layer_results) / len(target_layer_results) - - if self.cfg.layer_norm_targets: - y = F.layer_norm(y.float(), y.shape[-1:]) - - if self.cfg.instance_norm_targets: - y = 
F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2) - - if not permuted: - y = y.transpose(0, 1) - - y = y[mask_indices] - - x = x[mask_indices] - x = self.final_proj(x) - - sz = x.size(-1) - - if self.loss_beta == 0: - loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) - else: - loss = F.smooth_l1_loss(x.float(), y.float(), reduction="none", beta=self.loss_beta).sum(dim=-1) - - if self.loss_scale is not None: - scale = self.loss_scale - else: - scale = 1 / math.sqrt(sz) - - result["losses"]["regression"] = loss.sum() * scale - - if "sample_size" not in result: - result["sample_size"] = loss.numel() - - with torch.no_grad(): - result["target_var"] = self.compute_var(y) - result["pred_var"] = self.compute_var(x.float()) - - if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var: - logger.error(f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting") - raise Exception(f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting") - if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var: - logger.error(f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting") - raise Exception(f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting") - - if self.ema is not None: - result["ema_decay"] = self.ema.get_decay() * 1000 - - return result - - @staticmethod - def compute_var(y): - y = y.view(-1, y.size(-1)) - if dist.is_initialized(): - zc = torch.tensor(y.size(0)).cuda() - zs = y.sum(dim=0) - zss = (y**2).sum(dim=0) - - dist.all_reduce(zc) - dist.all_reduce(zs) - dist.all_reduce(zss) - - var = zss / (zc - 1) - (zs**2) / (zc * (zc - 1)) - return torch.sqrt(var + 1e-6).mean() - else: - return torch.sqrt(y.var(dim=0) + 1e-6).mean() - - def extract_features(self, source, padding_mask, mask=False, layer=None): - res = self.forward( - source, - padding_mask, - mask=mask, - features_only=True, - layer=layer, - ) - return res - - def remove_pretraining_modules(self, last_layer=None): - self.final_proj = None - self.ema = None - if last_layer is not None: - self.encoder.layers = nn.ModuleList(l for i, l in enumerate(self.encoder.layers) if i <= last_layer) diff --git a/src/transformers/models/data2vec/data2vec_text.py b/src/transformers/models/data2vec/data2vec_text.py deleted file mode 100644 index b55dbddd733dd..0000000000000 --- a/src/transformers/models/data2vec/data2vec_text.py +++ /dev/null @@ -1,474 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -# TODO: Remove file - -import logging -import math -from dataclasses import dataclass, field -from typing import Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq.dataclass import FairseqDataclass -from fairseq.dataclass.configs import EMAConfig -from fairseq.models import FairseqEncoder, FairseqEncoderModel, register_model -from fairseq.models.ema import EMA -from fairseq.models.roberta.model import RobertaClassificationHead, RobertaLMHead -from fairseq.models.transformer import TransformerConfig, TransformerEncoder -from fairseq.modules.transformer_sentence_encoder import init_bert_params - -from omegaconf import II - - -logger = logging.getLogger(__name__) - - -@dataclass -class Data2VecTextConfig(FairseqDataclass): - max_positions: int = II("task.tokens_per_sample") - - head_layers: int = 1 - - transformer: TransformerConfig = TransformerConfig() - - load_checkpoint_heads: bool = field( - default=False, - metadata={"help": "(re-)register and load heads when loading checkpoints"}, - ) - - loss_beta: float = field(default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}) - loss_scale: Optional[float] = field( - default=None, - metadata={"help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"}, - ) - average_top_k_layers: int = field(default=8, metadata={"help": "how many layers to average"}) - - layer_norm_target_layer: bool = False - instance_norm_target_layer: bool = False - batch_norm_target_layer: bool = False - instance_norm_targets: bool = False - layer_norm_targets: bool = False - - ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) - ema_end_decay: float = field(default=0.9999, metadata={"help": "final ema decay rate"}) - - # when to finish annealing ema decay rate - ema_anneal_end_step: int = II("optimization.max_update") - - ema_transformer_layers_only: bool = field( - default=True, - metadata={"help": "whether to momentum update only the transformer layers"}, - ) - - -def get_annealed_rate(start, end, curr_step, total_steps): - r = end - start - pct_remaining = 1 - curr_step / total_steps - return end - r * pct_remaining - - -@register_model("data2vec_text", dataclass=Data2VecTextConfig) -class Data2VecTextModel(FairseqEncoderModel): - def __init__(self, cfg: Data2VecTextConfig, encoder): - super().__init__(encoder) - self.cfg = cfg - - # We follow BERT's random weight initialization - self.apply(init_bert_params) - - self.classification_heads = nn.ModuleDict() - - @classmethod - def build_model(cls, cfg, task): - """Build a new model instance.""" - - encoder = Data2VecTextEncoder(cfg, task.source_dictionary, task.cfg.data) - - return cls(cfg, encoder) - - def forward( - self, - src_tokens, - target_tokens=None, - features_only=False, - return_all_hiddens=False, - classification_head_name=None, - **kwargs, - ): - if classification_head_name is not None: - features_only = True - - res = self.encoder(src_tokens, target_tokens, features_only, return_all_hiddens, **kwargs) - - if isinstance(res, tuple): - x, extra = res - else: - return res - - if classification_head_name is not None: - x = self.classification_heads[classification_head_name](x) - return x, extra - - def get_normalized_probs(self, net_output, log_probs, sample=None): - """Get normalized probabilities (or log probs) from a net's output.""" - logits = net_output[0].float() - if log_probs: - return F.log_softmax(logits, dim=-1) - else: - return F.softmax(logits, dim=-1) - - def 
register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs): - """Register a classification head.""" - if name in self.classification_heads: - prev_num_classes = self.classification_heads[name].out_proj.out_features - prev_inner_dim = self.classification_heads[name].dense.out_features - if num_classes != prev_num_classes or inner_dim != prev_inner_dim: - logger.warning( - 're-registering head "{}" with num_classes {} (prev: {}) ' - "and inner_dim {} (prev: {})".format( - name, num_classes, prev_num_classes, inner_dim, prev_inner_dim - ) - ) - self.classification_heads[name] = RobertaClassificationHead( - input_dim=self.cfg.transformer.encoder.embed_dim, - inner_dim=inner_dim or self.cfg.transformer.encoder.embed_dim, - num_classes=num_classes, - activation_fn="tanh", - pooler_dropout=0, - ) - - @property - def supported_targets(self): - return {"self"} - - def upgrade_state_dict_named(self, state_dict, name): - prefix = name + "." if name != "" else "" - - # rename decoder -> encoder before upgrading children modules - for k in list(state_dict.keys()): - if k.startswith(prefix + "decoder"): - new_k = prefix + "encoder" + k[len(prefix + "decoder") :] - state_dict[new_k] = state_dict[k] - del state_dict[k] - - # rename emb_layer_norm -> layernorm_embedding - for k in list(state_dict.keys()): - if ".emb_layer_norm." in k: - new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.") - state_dict[new_k] = state_dict[k] - del state_dict[k] - - if self.encoder.regression_head is not None: - if ".lm_head." in k: - new_k = k.replace(".lm_head.", ".regression_head.") - state_dict[new_k] = state_dict[k] - del state_dict[k] - else: - if ".regression_head." in k: - del state_dict[k] - - # upgrade children modules - super().upgrade_state_dict_named(state_dict, name) - - # Handle new classification heads present in the state dict. - current_head_names = ( - [] - if not hasattr(self, "classification_heads") or self.classification_heads is None - else self.classification_heads.keys() - ) - keys_to_delete = [] - for k in state_dict.keys(): - if not k.startswith(prefix + "classification_heads."): - continue - - head_name = k[len(prefix + "classification_heads.") :].split(".")[0] - num_classes = state_dict[prefix + "classification_heads." + head_name + ".out_proj.weight"].size(0) - inner_dim = state_dict[prefix + "classification_heads." + head_name + ".dense.weight"].size(0) - - if self.cfg.load_checkpoint_heads: - if head_name not in current_head_names: - self.register_classification_head(head_name, num_classes, inner_dim) - else: - if head_name not in current_head_names: - logger.warning( - "deleting classification head ({}) from checkpoint " - "not present in current model: {}".format(head_name, k) - ) - keys_to_delete.append(k) - elif ( - num_classes != self.classification_heads[head_name].out_proj.out_features - or inner_dim != self.classification_heads[head_name].dense.out_features - ): - logger.warning( - "deleting classification head ({}) from checkpoint " - "with different dimensions than current model: {}".format(head_name, k) - ) - keys_to_delete.append(k) - for k in keys_to_delete: - del state_dict[k] - - # Copy any newly-added classification heads into the state dict - # with their current weights. - if ( - hasattr(self, "classification_heads") - and self.classification_heads is not None - and len(self.classification_heads) > 0 - ): - cur_state = self.classification_heads.state_dict() - for k, v in cur_state.items(): - if prefix + "classification_heads." 
+ k not in state_dict: - logger.info("Overwriting " + prefix + "classification_heads." + k) - state_dict[prefix + "classification_heads." + k] = v - - for k in list(state_dict.keys()): - if k.startswith(prefix + "encoder.lm_head.") or k.startswith(prefix + "encoder.emb_head."): - del state_dict[k] - - self.encoder.lm_head = None - - if self.encoder.target_model is None: - for k in list(state_dict.keys()): - if k.startswith(prefix + "encoder.target_model."): - del state_dict[k] - - if (self.encoder.ema is None) and (prefix + "encoder._ema" in state_dict): - del state_dict[prefix + "encoder._ema"] - - def remove_pretraining_modules(self, last_layer=None): - self.encoder.lm_head = None - self.encoder.regression_head = None - self.encoder.ema = None - self.classification_heads = None - - if last_layer is not None: - self.encoder.sentence_encoder.layers = nn.ModuleList( - l for i, l in enumerate(self.encoder.sentence_encoder.layers) if i <= last_layer - ) - self.encoder.sentence_encoder.layer_norm = None - - -class Data2VecTextEncoder(FairseqEncoder): - def __init__(self, cfg: Data2VecTextConfig, dictionary, task_data): - super().__init__(dictionary) - - self.cfg = cfg - - embed_tokens = self.build_embedding(len(dictionary), cfg.transformer.encoder.embed_dim, dictionary.pad()) - - self.sentence_encoder = self.build_encoder(cfg, dictionary, embed_tokens) - self.mask_idx = dictionary.index("") - assert self.mask_idx != dictionary.unk(), dictionary.symbols - - self.ema = None - self.average_top_k_layers = cfg.average_top_k_layers - self.loss_scale = cfg.loss_scale - - assert self.cfg.head_layers >= 1 - - embed_dim = cfg.transformer.encoder.embed_dim - curr_dim = embed_dim - projs = [] - for i in range(self.cfg.head_layers - 1): - next_dim = embed_dim * 2 if i == 0 else curr_dim - projs.append(nn.Linear(curr_dim, next_dim)) - projs.append(nn.GELU()) - curr_dim = next_dim - - projs.append(nn.Linear(curr_dim, embed_dim)) - self.regression_head = nn.Sequential(*projs) - - self.num_updates = 0 - - def build_embedding(self, vocab_size, embedding_dim, padding_idx): - return nn.Embedding(vocab_size, embedding_dim, padding_idx) - - def build_encoder(self, cfg, dictionary, embed_tokens): - encoder = TransformerEncoder(cfg.transformer, dictionary, embed_tokens, return_fc=True) - encoder.apply(init_bert_params) - return encoder - - def build_lm_head(self, embed_dim, output_dim, activation_fn, weight): - return RobertaLMHead(embed_dim, output_dim, activation_fn, weight) - - def make_ema_teacher(self): - ema_config = EMAConfig( - ema_decay=self.cfg.ema_decay, - ema_fp32=True, - ) - skip_keys = set() - if self.cfg.ema_transformer_layers_only: - for k, _ in self.sentence_encoder.embed_positions.named_parameters(): - skip_keys.add(f"embed_tokens.{k}") - for k, _ in self.sentence_encoder.embed_positions.named_parameters(): - skip_keys.add(f"embed_positions.{k}") - if self.sentence_encoder.layernorm_embedding is not None: - for ( - k, - _, - ) in self.sentence_encoder.layernorm_embedding.named_parameters(): - skip_keys.add(f"layernorm_embedding.{k}") - if self.sentence_encoder.layer_norm is not None: - for k, _ in self.sentence_encoder.layer_norm.named_parameters(): - skip_keys.add(f"layernorm_embedding.{k}") - - self.ema = EMA( - self.sentence_encoder, - ema_config, - skip_keys=skip_keys, - ) - - def set_num_updates(self, num_updates): - super().set_num_updates(num_updates) - - if self.ema is None and self.regression_head is not None: - logger.info("making ema teacher") - self.make_ema_teacher() - elif 
self.training and self.ema is not None: - if self.cfg.ema_decay != self.cfg.ema_end_decay: - if num_updates >= self.cfg.ema_anneal_end_step: - decay = self.cfg.ema_end_decay - else: - decay = get_annealed_rate( - self.cfg.ema_decay, - self.cfg.ema_end_decay, - num_updates, - self.cfg.ema_anneal_end_step, - ) - self.ema._set_decay(decay) - if self.ema.get_decay() < 1: - self.ema.step(self.sentence_encoder) - - def state_dict(self, destination=None, prefix="", keep_vars=False): - state = super().state_dict(destination, prefix, keep_vars) - if self.ema is not None: - state[prefix + "_ema"] = self.ema.fp32_params - return state - - def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): - if self.ema is not None: - k = prefix + "_ema" - assert k in state_dict - self.ema.restore(state_dict[k], True) - del state_dict[k] - return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) - - def forward( - self, - src_tokens, - target_tokens=None, - features_only=False, - return_all_hiddens=False, - masked_tokens=None, - **unused, - ): - """ - Args: - src_tokens (LongTensor): input tokens of shape `(batch, src_len)` - features_only (bool, optional): skip LM head and just return - features. If True, the output will be of shape `(batch, src_len, embed_dim)`. - return_all_hiddens (bool, optional): also return all of the - intermediate hidden states (default: False). - - Returns: - tuple: - - the LM output of shape `(batch, src_len, vocab)` - - a dictionary of additional data, where 'inner_states' is a list of hidden states. Note that the - hidden states have shape `(src_len, batch, vocab)`. - """ - - x, extra = self.extract_features(src_tokens, return_all_hiddens=return_all_hiddens) - - if features_only: - return x, extra - - assert target_tokens is not None - - with torch.no_grad(): - # use EMA parameter as the teacher - self.ema.model.eval() - - encoder_out = self.ema.model( - target_tokens, - return_all_hiddens=True, - ) - y = encoder_out["fc_results"] - - y = y[-self.average_top_k_layers :] - - permuted = False - if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: - y = [tl.permute(1, 2, 0) for tl in y] # TBC -> BCT - permuted = True - - if self.cfg.batch_norm_target_layer: - y = [F.batch_norm(tl.float(), running_mean=None, running_var=None, training=True) for tl in y] - - if self.cfg.instance_norm_target_layer: - y = [F.instance_norm(tl.float()) for tl in y] - - if permuted: - y = [tl.transpose(1, 2) for tl in y] # BCT -> BTC - - if self.cfg.layer_norm_target_layer: - y = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in y] - - y = sum(y) / len(y) - - if not permuted: - y = y.transpose(0, 1) - - if self.cfg.layer_norm_targets: - y = F.layer_norm(y.float(), y.shape[-1:]) - - if self.cfg.instance_norm_targets: - y = F.instance_norm(y.transpose(1, 2)).transpose(1, 2) - - masked_indices = src_tokens.eq(self.mask_idx) - - x = x[masked_indices] - y = y[masked_indices] - - x = self.regression_head(x) - - sz = x.size(-1) - if self.cfg.loss_beta == 0: - loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) - else: - loss = F.smooth_l1_loss(x.float(), y.float(), reduction="none", beta=self.cfg.loss_beta).sum(dim=-1) - - result = { - "losses": { - "main": loss.sum() / math.sqrt(sz) if self.loss_scale <= 0 else loss.sum() * self.loss_scale, - }, - "sample_size": loss.numel(), - } - - # logging other values - other_logs = {"ema_decay": self.ema.get_decay() * 1000} - result["logs"] = other_logs - return result - - def extract_features(self, 
src_tokens, return_all_hiddens=False, **kwargs): - encoder_out = self.sentence_encoder( - src_tokens, - return_all_hiddens=return_all_hiddens, - token_embeddings=kwargs.get("token_embeddings", None), - ) - # T x B x C -> B x T x C - features = encoder_out["encoder_out"][0].transpose(0, 1) - inner_states = encoder_out["encoder_states"] if return_all_hiddens else None - return features, { - "inner_states": inner_states, - "encoder_embedding": encoder_out["encoder_embedding"][0], - } - - def output_layer(self, features, masked_tokens=None, **unused): - return self.lm_head(features, masked_tokens) - - def max_positions(self): - """Maximum output length supported by the encoder.""" - return self.cfg.max_positions diff --git a/src/transformers/models/data2vec/ema_module.py b/src/transformers/models/data2vec/ema_module.py deleted file mode 100644 index 98ceeb4e83735..0000000000000 --- a/src/transformers/models/data2vec/ema_module.py +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env python3 - -# TODO: Remove file - -""" -Used for EMA tracking a given pytorch module. The user is responsible for calling step() and setting the appropriate -decay -""" - -import copy -import logging -from dataclasses import dataclass, field - -import torch -from fairseq.dataclass import FairseqDataclass - - -@dataclass -class EMAModuleConfig(FairseqDataclass): - ema_decay: float = field(default=0.9999, metadata={"help": "decay for exponential moving average model"}) - ema_fp32: bool = field( - default=False, - metadata={"help": "If true, store EMA model in fp32 even if model is in fp16"}, - ) - - -class EMAModule: - """Exponential Moving Average of Fairseq Models""" - - def __init__(self, model, config: EMAModuleConfig, device=None, skip_keys=None): - """ - @param model model to initialize the EMA with @param config EMAConfig object with configuration like ema_decay, - ema_update_freq, ema_fp32 @param device If provided, copy EMA to this device (e.g. gpu). Otherwise EMA is in - the same device as the model. - """ - - self.decay = config.ema_decay - self.model = copy.deepcopy(model) - self.model.requires_grad_(False) - self.config = config - self.skip_keys = skip_keys or set() - self.fp32_params = {} - - if device is not None: - logging.info(f"Copying EMA model to device {device}") - self.model = self.model.to(device=device) - - if self.config.ema_fp32: - self.build_fp32_params() - - self.update_freq_counter = 0 - - def build_fp32_params(self, state_dict=None): - """ - Store a copy of the EMA params in fp32. If state dict is passed, the EMA params is copied from the provided - state dict. Otherwise, it is copied from the current EMA model parameters. - """ - if not self.config.ema_fp32: - raise RuntimeError( - "build_fp32_params should not be called if ema_fp32=False. " - "Use ema_fp32=True if this is really intended." 
- ) - - if state_dict is None: - state_dict = self.model.state_dict() - - def _to_float(t): - return t.float() if torch.is_floating_point(t) else t - - for param_key in state_dict: - if param_key in self.fp32_params: - self.fp32_params[param_key].copy_(state_dict[param_key]) - else: - self.fp32_params[param_key] = _to_float(state_dict[param_key]) - - def restore(self, state_dict, build_fp32_params=False): - """Load data from a model spec into EMA model""" - self.model.load_state_dict(state_dict, strict=False) - if build_fp32_params: - self.build_fp32_params(state_dict) - - def set_decay(self, decay): - self.decay = decay - - def get_decay(self): - return self.decay - - def _step_internal(self, new_model): - """One update of the EMA model based on new model weights""" - decay = self.decay - - ema_state_dict = {} - ema_params = self.fp32_params if self.config.ema_fp32 else self.model.state_dict() - for key, param in new_model.state_dict().items(): - if isinstance(param, dict): - continue - try: - ema_param = ema_params[key] - except KeyError: - ema_param = param.float().clone() if param.ndim == 1 else copy.deepcopy(param) - - if param.shape != ema_param.shape: - raise ValueError( - "incompatible tensor shapes between model param and ema param" - + "{} vs. {}".format(param.shape, ema_param.shape) - ) - - if "version" in key: - # Do not decay a model.version pytorch param - continue - - if key in self.skip_keys: - ema_param = param.to(dtype=ema_param.dtype).clone() - ema_params[key].copy_(ema_param) - else: - ema_param.mul_(decay) - ema_param.add_(param.to(dtype=ema_param.dtype), alpha=1 - decay) - ema_state_dict[key] = ema_param - self.restore(ema_state_dict, build_fp32_params=False) - - def step(self, new_model): - self._step_internal(new_model) - - def reverse(self, model): - """ - Load the model parameters from EMA model. Useful for inference or fine-tuning from the EMA model. - """ - d = self.model.state_dict() - if "_ema" in d: - del d["_ema"] - - model.load_state_dict(d, strict=False) - return model From 7fb323465b4e4fb4dca7f79001dddaa89f4b1934 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 23 Feb 2022 21:11:25 -0500 Subject: [PATCH 111/121] Change paper link to arxiv --- docs/source/model_doc/data2vec.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index 6c682e03858f7..3c5ca3dd749c8 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. ## Overview -The Data2Vec model was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://scontent-sjc3-1.xx.fbcdn.net/v/t39.8562-6/271974914_483120576492438_4239522333319653600_n.pdf?_nc_cat=107&ccb=1-5&_nc_sid=ae5e01&_nc_ohc=7huShTb_QZIAX-N7SYx&_nc_ht=scontent-sjc3-1.xx&oh=00_AT_lXXL69mjqmdVWbaLh4Ro6DY17aFeO5vA9I-mIpyNieg&oe=6205C411) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and Michael Auli. +The Data2Vec model was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/pdf/2202.03555.pdf) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and Michael Auli. Data2Vec proposes a unified framework for self-supervised learning across different data modalities - text, audio and images. 
Importantly, predicted targets for pre-training are contextualized latent representations of the inputs, rather than modality-specific, context-independent targets.

From cdf60e890bb33787006620bad9e7198036318c22 Mon Sep 17 00:00:00 2001
From: Eduardo Gonzalez Ponferrada
Date: Wed, 23 Feb 2022 21:16:09 -0500
Subject: [PATCH 112/121] Modify Data2Vec documentation to reflect that the
 encoder is not shared across the audio and text models in the current
 implementation.

---
 docs/source/model_doc/data2vec.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx
index 3c5ca3dd749c8..ba8c4c559022d 100644
--- a/docs/source/model_doc/data2vec.mdx
+++ b/docs/source/model_doc/data2vec.mdx
@@ -33,8 +33,8 @@ Models and code are available at www.github.com/pytorch/fairseq/tree/master/exam

 Tips:

-- This implementation has a shared encoder for all different modalities and different pre-processors for each modality.
-  For example, in the case of text, preprocessing is identical to [`RobertaModel`], including tokenization.
+- Both Data2VecAudio and Data2VecText have been trained using the same self-supervised learning method.
+  In the case of Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization.

 This model was contributed by [edugp](https://huggingface.co/edugp).
 The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec).

From 166217f357658cb415f6b10742685508ec5968bb Mon Sep 17 00:00:00 2001
From: Eduardo Gonzalez Ponferrada
Date: Wed, 23 Feb 2022 21:20:24 -0500
Subject: [PATCH 113/121] Update text model checkpoint to be
 facebook/data2vec-text-base

---
 .../models/data2vec/configuration_data2vec_text.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py
index 8cf165ced3db2..0356e6c098756 100644
--- a/src/transformers/models/data2vec/configuration_data2vec_text.py
+++ b/src/transformers/models/data2vec/configuration_data2vec_text.py
@@ -24,7 +24,7 @@
 logger = logging.get_logger(__name__)

 DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "edugp/data2vec-nlp-base": "https://huggingface.co/data2vec/resolve/main/config.json",
+    "facebook/data2vec-text-base": "https://huggingface.co/data2vec/resolve/main/config.json",
 }


@@ -33,7 +33,7 @@ class Data2VecTextConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`Data2VecTextModel`] and [`Data2VecTextModel`]. It
     is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecText
-    [edugp/data2vec-nlp-base](https://huggingface.co/edugp/data2vec-nlp-base) architecture.
+    [facebook/data2vec-text-base](https://huggingface.co/facebook/data2vec-text-base) architecture.

     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -84,10 +84,10 @@ class Data2VecTextConfig(PretrainedConfig): ```python >>> from transformers import Data2VecTextModel, Data2VecTextConfig - >>> # Initializing a Data2VecText edugp/data2vec-nlp-base style configuration + >>> # Initializing a Data2VecText facebook/data2vec-text-base style configuration >>> configuration = Data2VecTextConfig() - >>> # Initializing a model from the edugp/data2vec-nlp-base style configuration + >>> # Initializing a model from the facebook/data2vec-text-base style configuration >>> model = Data2VecTextModel(configuration) >>> # Accessing the model configuration From 98df30110dab37f83095c9fb3bc8574748c56679 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Ponferrada Date: Wed, 23 Feb 2022 22:05:49 -0500 Subject: [PATCH 114/121] Add 'Copy from' statements and update paper links and docs --- docs/source/model_doc/data2vec.mdx | 2 +- .../data2vec/modeling_data2vec_audio.py | 24 ++++++++++++++++--- .../models/data2vec/modeling_data2vec_text.py | 12 ++++------ 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/docs/source/model_doc/data2vec.mdx b/docs/source/model_doc/data2vec.mdx index ba8c4c559022d..f84593d0f9a59 100644 --- a/docs/source/model_doc/data2vec.mdx +++ b/docs/source/model_doc/data2vec.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. ## Overview -The Data2Vec model was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/pdf/2202.03555.pdf) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and Michael Auli. +The Data2Vec model was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and Michael Auli. Data2Vec proposes a unified framework for self-supervised learning across different data modalities - text, audio and images. Importantly, predicted targets for pre-training are contextualized latent representations of the inputs, rather than modality-specific, context-independent targets. 
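Taken together, the documentation and configuration changes above present Data2VecText as a RoBERTa-style encoder whose preprocessing, including tokenization, matches [`RobertaModel`]. A minimal feature-extraction sketch along those lines — assuming the `facebook/data2vec-text-base` checkpoint referenced in these patches ships a RoBERTa-style tokenizer — could look like this:

```python
# Minimal sketch (not part of the patch series): extract features with the text model.
# Assumes the facebook/data2vec-text-base checkpoint ships a RoBERTa-style tokenizer.
import torch
from transformers import Data2VecTextModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("facebook/data2vec-text-base")
model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base")

inputs = tokenizer("Hello, data2vec!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Contextualized representations: (batch_size, sequence_length, hidden_size)
print(outputs.last_hidden_state.shape)
```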
diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 90b8dce841e1d..91ecd7ee7e6a9 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -109,6 +109,7 @@ class Data2VecAudioBaseModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.XVectorOutput with Wav2Vec2->Data2VecAudio @dataclass class XVectorOutput(ModelOutput): """ @@ -141,6 +142,7 @@ class XVectorOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None +# Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices with Wav2Vec2->Data2VecAudio def _compute_mask_indices( shape: Tuple[int, int], mask_prob: float, @@ -287,6 +289,7 @@ def forward(self, hidden_states): return hidden_states +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->Data2VecAudio class Data2VecAudioPadLayer(nn.Module): def __init__(self, conv_pos_kernel_size): super().__init__() @@ -340,6 +343,7 @@ def forward(self, hidden_states): return hidden_states +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->Data2VecAudio class Data2VecAudioFeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" @@ -382,6 +386,7 @@ def custom_forward(*inputs): return hidden_states +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->Data2VecAudio class Data2VecAudioFeatureProjection(nn.Module): def __init__(self, config): super().__init__() @@ -542,6 +547,7 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value +# Copied from transformers.models.wav2vec2modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->Data2VecAudio class Data2VecAudioFeedForward(nn.Module): def __init__(self, config): super().__init__() @@ -566,6 +572,7 @@ def forward(self, hidden_states): return hidden_states +# Copied from transformers.models.wav2vec2modeling_wav2vec2.Wav2Vec2EncoderLayer with Bart->Data2VecAudio class Data2VecAudioEncoderLayer(nn.Module): def __init__(self, config): super().__init__() @@ -600,6 +607,7 @@ def forward(self, hidden_states, attention_mask=None, output_attentions=False): return outputs +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder with Wav2Vec2->Data2VecAudio class Data2VecAudioEncoder(nn.Module): def __init__(self, config): super().__init__() @@ -685,6 +693,7 @@ def custom_forward(*inputs): ) +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Adapter with Wav2Vec2->Data2VecAudio class Data2VecAudioAdapter(nn.Module): def __init__(self, config): super().__init__() @@ -716,6 +725,7 @@ def forward(self, hidden_states): return hidden_states +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AdapterLayer with Wav2Vec2->Data2VecAudio class Data2VecAudioAdapterLayer(nn.Module): def __init__(self, config): super().__init__() @@ -771,6 +781,7 @@ def _init_weights(self, module): k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) nn.init.uniform_(module.bias, a=-k, b=k) + # Copied from transformers.models.wav2vec2.modeling_wav2vec2._get_feat_extract_output_lengths with Wav2Vec2->Data2VecAudio def _get_feat_extract_output_lengths( self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None ): @@ 
-794,6 +805,7 @@ def _conv_out_length(input_length, kernel_size, stride): return input_lengths + # Copied from transformers.models.wav2vec2.modeling_wav2vec2._get_feature_vector_attention_mask with Wav2Vec2->Data2VecAudio def _get_feature_vector_attention_mask( self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None ): @@ -820,9 +832,9 @@ def _set_gradient_checkpointing(self, module, value=False): DATA2VEC_AUDIO_START_DOCSTRING = r""" - Data2VecAudio was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech - Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael - Auli. + Data2VecAudio was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, + Vision and Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, + Jiatao Gu and Michael Auli. This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving etc.). @@ -1013,6 +1025,7 @@ def forward( ) +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->Data2VecAudio @add_start_docstrings( """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", DATA2VEC_AUDIO_START_DOCSTRING, @@ -1125,6 +1138,7 @@ def forward( ) +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification with Wav2Vec2->Data2VecAudio @add_start_docstrings( """ Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks @@ -1233,6 +1247,7 @@ def forward( ) +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification with Wav2Vec2->Data2VecAudio @add_start_docstrings( """ Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization. @@ -1323,6 +1338,7 @@ def forward( ) +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss with Wav2Vec2->Data2VecAudio class AMSoftmaxLoss(nn.Module): def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): super(AMSoftmaxLoss, self).__init__() @@ -1346,6 +1362,7 @@ def forward(self, hidden_states, labels): return loss +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer with Wav2Vec2->Data2VecAudio class TDNNLayer(nn.Module): def __init__(self, config, layer_id=0): super().__init__() @@ -1372,6 +1389,7 @@ def forward(self, hidden_states): return hidden_states +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector with Wav2Vec2->Data2VecAudio @add_start_docstrings( """ Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification. 
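The `# Copied from ... with Old->New` markers added throughout the audio modeling file above are what keep these classes mechanically in sync with their Wav2Vec2 counterparts; in the transformers repository such markers are enforced and re-generated by a consistency script, typically run through `make fix-copies`. The toy sketch below — not the library's actual checker — only illustrates how such markers can be parsed:

```python
# Toy illustration (not transformers' real utils/check_copies.py): parse "Copied from" markers
# of the form "# Copied from <qualified.name> with Old->New" as added in the patch above.
import re

MARKER = re.compile(
    r"^\s*#\s*Copied from\s+(?P<source>[\w.]+)"  # fully qualified source object
    r"(?:\s+with\s+(?P<replacements>.+))?\s*$"   # optional "with Old->New, ..." clause
)

example = """\
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->Data2VecAudio
class Data2VecAudioFeedForward:
    pass
"""

for line in example.splitlines():
    match = MARKER.match(line)
    if match:
        source = match.group("source")
        clause = match.group("replacements") or ""
        pairs = [tuple(part.strip().split("->")) for part in clause.split(",") if "->" in part]
        print(f"keep in sync with {source}, applying replacements {pairs}")
```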
diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 6e8cd7d82078d..1bf6b7d9ec204 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -54,17 +54,11 @@ _HIDDEN_STATES_START_POSITION = 2 +# General docstring _CHECKPOINT_FOR_DOC = "facebook/data2vec-text-base" _CONFIG_FOR_DOC = "Data2VecTextConfig" _TOKENIZER_FOR_DOC = "RobertaTokenizer" -# General docstring -_CONFIG_FOR_DOC = "Wav2Vec2Config" -_PROCESSOR_FOR_DOC = "Wav2Vec2Processor" - -# CTC docstring -_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" -_CTC_EXPECTED_LOSS = 53.48 DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "facebook/data2vec-text-base", @@ -163,6 +157,7 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): return position_ids.unsqueeze(0).expand(input_shape) +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->Data2VecText class Data2VecTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() @@ -629,6 +624,9 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): DATA2VECTEXT_START_DOCSTRING = r""" + Data2VecText was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, + Vision and Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, + Jiatao Gu and Michael Auli. This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads From 02d9e5edcd062f3a66fd10c6392641e94c10012b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 25 Feb 2022 15:59:06 +0000 Subject: [PATCH 115/121] fix copy from statements --- .../data2vec/modeling_data2vec_audio.py | 30 +++++++++---------- .../models/data2vec/modeling_data2vec_text.py | 6 ++-- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 91ecd7ee7e6a9..706eb75a5ecd0 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -109,8 +109,8 @@ class Data2VecAudioBaseModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.XVectorOutput with Wav2Vec2->Data2VecAudio @dataclass +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.XVectorOutput with Wav2Vec2->Data2VecAudio class XVectorOutput(ModelOutput): """ Output type of [`Data2VecAudioForXVector`]. 
@@ -142,7 +142,7 @@ class XVectorOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -# Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices with Wav2Vec2->Data2VecAudio +# Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices def _compute_mask_indices( shape: Tuple[int, int], mask_prob: float, @@ -547,7 +547,7 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value -# Copied from transformers.models.wav2vec2modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->Data2VecAudio +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->Data2VecAudio class Data2VecAudioFeedForward(nn.Module): def __init__(self, config): super().__init__() @@ -572,7 +572,7 @@ def forward(self, hidden_states): return hidden_states -# Copied from transformers.models.wav2vec2modeling_wav2vec2.Wav2Vec2EncoderLayer with Bart->Data2VecAudio +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer with Bart->Data2VecAudio class Data2VecAudioEncoderLayer(nn.Module): def __init__(self, config): super().__init__() @@ -781,7 +781,7 @@ def _init_weights(self, module): k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) nn.init.uniform_(module.bias, a=-k, b=k) - # Copied from transformers.models.wav2vec2.modeling_wav2vec2._get_feat_extract_output_lengths with Wav2Vec2->Data2VecAudio + # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PreTrainedModel._get_feat_extract_output_lengths with def _get_feat_extract_output_lengths( self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None ): @@ -805,7 +805,7 @@ def _conv_out_length(input_length, kernel_size, stride): return input_lengths - # Copied from transformers.models.wav2vec2.modeling_wav2vec2._get_feature_vector_attention_mask with Wav2Vec2->Data2VecAudio + # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PreTrainedModel._get_feature_vector_attention_mask def _get_feature_vector_attention_mask( self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None ): @@ -832,9 +832,9 @@ def _set_gradient_checkpointing(self, module, value=False): DATA2VEC_AUDIO_START_DOCSTRING = r""" - Data2VecAudio was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, - Vision and Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, - Jiatao Gu and Michael Auli. + Data2VecAudio was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and + Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and + Michael Auli. This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving etc.). 
@@ -1025,11 +1025,11 @@ def forward( ) -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->Data2VecAudio @add_start_docstrings( """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", DATA2VEC_AUDIO_START_DOCSTRING, ) +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->Data2VecAudio, wav2vec2-data2vec_audio, WAV_2_VEC_2->DATA2VEC_AUDIO class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1138,7 +1138,6 @@ def forward( ) -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification with Wav2Vec2->Data2VecAudio @add_start_docstrings( """ Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks @@ -1146,6 +1145,7 @@ def forward( """, DATA2VEC_AUDIO_START_DOCSTRING, ) +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification with Wav2Vec2->Data2VecAudio, wav2vec2-data2vec_audio, WAV_2_VEC_2->DATA2VEC_AUDIO class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1247,13 +1247,13 @@ def forward( ) -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification with Wav2Vec2->Data2VecAudio @add_start_docstrings( """ Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization. """, DATA2VEC_AUDIO_START_DOCSTRING, ) +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification with Wav2Vec2->Data2VecAudio, wav2vec2-data2vec_audio, WAV_2_VEC_2->DATA2VEC_AUDIO class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1338,7 +1338,7 @@ def forward( ) -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss with Wav2Vec2->Data2VecAudio +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss class AMSoftmaxLoss(nn.Module): def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): super(AMSoftmaxLoss, self).__init__() @@ -1362,7 +1362,7 @@ def forward(self, hidden_states, labels): return loss -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer with Wav2Vec2->Data2VecAudio +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer class TDNNLayer(nn.Module): def __init__(self, config, layer_id=0): super().__init__() @@ -1389,13 +1389,13 @@ def forward(self, hidden_states): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector with Wav2Vec2->Data2VecAudio @add_start_docstrings( """ Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification. 
""", DATA2VEC_AUDIO_START_DOCSTRING, ) +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector with Wav2Vec2->Data2VecAudio class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel): def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 1bf6b7d9ec204..e839a4e4c7728 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -624,9 +624,9 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): DATA2VECTEXT_START_DOCSTRING = r""" - Data2VecText was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, - Vision and Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, - Jiatao Gu and Michael Auli. + Data2VecText was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and + Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and + Michael Auli. This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads From 5b93a6443f57c74870c639d9164dc342f340845f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 25 Feb 2022 16:01:35 +0000 Subject: [PATCH 116/121] improve copied from --- src/transformers/models/data2vec/modeling_data2vec_audio.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 706eb75a5ecd0..bd1d54a04d88a 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -343,7 +343,6 @@ def forward(self, hidden_states): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->Data2VecAudio class Data2VecAudioFeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" @@ -355,11 +354,13 @@ def __init__(self, config): self.gradient_checkpointing = False self._requires_grad = True + # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder._freeze_parameters def _freeze_parameters(self): for param in self.parameters(): param.requires_grad = False self._requires_grad = False + # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder.forward def forward(self, input_values): hidden_states = input_values[:, None] From a149c18d49bc816bcd563c94924a8d77047fd755 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 25 Feb 2022 16:06:17 +0000 Subject: [PATCH 117/121] correct more copied from statements --- .../models/data2vec/modeling_data2vec_audio.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index bd1d54a04d88a..641958ceb3c99 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -81,6 +81,7 @@ @dataclass +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Data2VecAudioBaseModelOutput with 
Wav2Vec2->Data2VecAudio class Data2VecAudioBaseModelOutput(ModelOutput): """ Output type of [`Data2VecAudioBaseModelOutput`], with potential hidden states and attentions. @@ -573,7 +574,7 @@ def forward(self, hidden_states): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer with Bart->Data2VecAudio +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer with Wav2Vec2->Data2VecAudio class Data2VecAudioEncoderLayer(nn.Module): def __init__(self, config): super().__init__() @@ -1030,7 +1031,7 @@ def forward( """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", DATA2VEC_AUDIO_START_DOCSTRING, ) -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->Data2VecAudio, wav2vec2-data2vec_audio, WAV_2_VEC_2->DATA2VEC_AUDIO +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->Data2VecAudio, wav2vec2->data2vec_audio, WAV_2_VEC_2->DATA2VEC_AUDIO class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1146,7 +1147,7 @@ def forward( """, DATA2VEC_AUDIO_START_DOCSTRING, ) -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification with Wav2Vec2->Data2VecAudio, wav2vec2-data2vec_audio, WAV_2_VEC_2->DATA2VEC_AUDIO +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification with Wav2Vec2->Data2VecAudio, wav2vec2->data2vec_audio, WAV_2_VEC_2->DATA2VEC_AUDIO class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1254,7 +1255,7 @@ def forward( """, DATA2VEC_AUDIO_START_DOCSTRING, ) -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification with Wav2Vec2->Data2VecAudio, wav2vec2-data2vec_audio, WAV_2_VEC_2->DATA2VEC_AUDIO +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification with Wav2Vec2->Data2VecAudio, wav2vec2->data2vec_audio, WAV_2_VEC_2->DATA2VEC_AUDIO class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1396,7 +1397,7 @@ def forward(self, hidden_states): """, DATA2VEC_AUDIO_START_DOCSTRING, ) -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector with Wav2Vec2->Data2VecAudio +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector with Wav2Vec2->Data2VecAudio, wav2vec2->data2vec_audio, WAV_2_VEC_2->DATA2VEC_AUDIO class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel): def __init__(self, config): super().__init__(config) From 7708caaa64215096f8dd3051a0763aaa6a36ad1a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 25 Feb 2022 16:09:03 +0000 Subject: [PATCH 118/121] finish copied from stuff --- .../data2vec/modeling_data2vec_audio.py | 67 +++++++++++++++++-- .../models/data2vec/modeling_data2vec_text.py | 4 +- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 641958ceb3c99..787fe95acbf29 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -81,7 +81,7 @@ @dataclass -# Copied from 
transformers.models.wav2vec2.modeling_wav2vec2.Data2VecAudioBaseModelOutput with Wav2Vec2->Data2VecAudio +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput with Wav2Vec2->Data2VecAudio class Data2VecAudioBaseModelOutput(ModelOutput): """ Output type of [`Data2VecAudioBaseModelOutput`], with potential hidden states and attentions. @@ -292,9 +292,9 @@ def forward(self, hidden_states): # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->Data2VecAudio class Data2VecAudioPadLayer(nn.Module): - def __init__(self, conv_pos_kernel_size): + def __init__(self, num_conv_pos_embeddings): super().__init__() - self.num_pad_remove = 1 if conv_pos_kernel_size % 2 == 0 else 0 + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 def forward(self, hidden_states): if self.num_pad_remove > 0: @@ -1046,11 +1046,26 @@ def __init__(self, config): "instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. " "or define `vocab_size` of your model's configuration." ) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) # Initialize weights and apply final processing self.post_init() + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1152,6 +1167,10 @@ class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)" + ) self.data2vec_audio = Data2VecAudioModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: @@ -1162,6 +1181,18 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." 
+ "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1260,6 +1291,10 @@ class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Audio frame classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)" + ) self.data2vec_audio = Data2VecAudioModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: @@ -1268,6 +1303,18 @@ def __init__(self, config): self.init_weights() + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1418,6 +1465,18 @@ def __init__(self, config): self.init_weights() + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." 
+ "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index e839a4e4c7728..0ff73d742363f 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -187,7 +187,7 @@ def __init__(self, config, position_embedding_type=None): def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) + x = x.view(new_x_shape) return x.permute(0, 2, 1, 3) def forward( @@ -276,7 +276,7 @@ def forward( context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) + context_layer = context_layer.view(new_context_layer_shape) outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) From b9e1fe37ef801e7146c00c4978f95b38ceaa0771 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 25 Feb 2022 16:22:23 +0000 Subject: [PATCH 119/121] make style --- src/transformers/models/data2vec/modeling_data2vec_audio.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 787fe95acbf29..dd440ff244526 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -15,6 +15,7 @@ """ PyTorch Data2VecAudio model.""" import math +import warnings from dataclasses import dataclass from typing import Optional, Tuple, Union From 3389304e008df000912100d5249e05c77e83684f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 25 Feb 2022 22:37:15 +0100 Subject: [PATCH 120/121] add model to README --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/index.mdx | 1 + 5 files changed, 5 insertions(+) diff --git a/README.md b/README.md index ac34cae1d6cc7..79b24ee7d5fae 100644 --- a/README.md +++ b/README.md @@ -249,6 +249,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. 1. 
**[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. diff --git a/README_ko.md b/README_ko.md index 7472a3099c7bc..32debacea9ee0 100644 --- a/README_ko.md +++ b/README_ko.md @@ -230,6 +230,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. 
**[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. diff --git a/README_zh-hans.md b/README_zh-hans.md index 74b5f670e0292..174c846c2393a 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -254,6 +254,7 @@ conda install -c huggingface transformers 1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。 +1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (来自 Facebook) 伴随论文 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 由 Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index f48cece17b79f..86532189bfeeb 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -266,6 +266,7 @@ conda install -c huggingface transformers 1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 44a53a6e93774..d2b72f32ba160 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -75,6 +75,7 @@ conversion utilities for the following models. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. 1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. 
**[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. 1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. From 07167316a033a9e1ff2fe1e2b443764a228c0913 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 25 Feb 2022 22:42:07 +0100 Subject: [PATCH 121/121] add to master --- README.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 79b24ee7d5fae..1fc2e930f2a96 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. -1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. 1. 
**[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. diff --git a/README_ko.md b/README_ko.md index 32debacea9ee0..26f4d6b1f7b18 100644 --- a/README_ko.md +++ b/README_ko.md @@ -230,7 +230,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. -1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. 
**[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. diff --git a/README_zh-hans.md b/README_zh-hans.md index 174c846c2393a..aac9c77e6e3da 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -254,7 +254,7 @@ conda install -c huggingface transformers 1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。 -1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。 +1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (来自 Facebook) 伴随论文 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 由 Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 86532189bfeeb..0d52ffa69e872 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -266,7 +266,7 @@ conda install -c huggingface transformers 1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. -1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
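Beyond the README and index entries above, the series as a whole also contributes the audio-side classes (for example `Data2VecAudioForCTC`) together with the `freeze_feature_encoder` / deprecated `freeze_feature_extractor` helpers seen in the earlier patches. A rough, self-contained sketch exercising them — randomly initialized from a config, since these patches do not pin an audio checkpoint, and with `Data2VecAudioConfig` assumed as the corresponding configuration class — could look like this:

```python
# Rough sketch (not part of the patch series): run the CTC head on dummy audio.
# Data2VecAudioConfig is assumed to be the audio configuration class; no pretrained
# checkpoint is named here because the patches above do not pin one.
import torch
from transformers import Data2VecAudioConfig, Data2VecAudioForCTC

config = Data2VecAudioConfig(vocab_size=32)
model = Data2VecAudioForCTC(config)
model.freeze_feature_encoder()  # keep the convolutional feature encoder frozen, as for fine-tuning

dummy_input_values = torch.randn(1, 16000)  # one second of fake 16 kHz audio
with torch.no_grad():
    logits = model(input_values=dummy_input_values).logits

print(logits.shape)  # (batch_size, frames, vocab_size)
```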