Add Data2Vec #15507
Changes from 57 commits
@@ -0,0 +1,110 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Data2Vec

## Overview

The Data2Vec model was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://scontent-sjc3-1.xx.fbcdn.net/v/t39.8562-6/271974914_483120576492438_4239522333319653600_n.pdf?_nc_cat=107&ccb=1-5&_nc_sid=ae5e01&_nc_ohc=7huShTb_QZIAX-N7SYx&_nc_ht=scontent-sjc3-1.xx&oh=00_AT_lXXL69mjqmdVWbaLh4Ro6DY17aFeO5vA9I-mIpyNieg&oe=6205C411) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and Michael Auli.
Data2Vec proposes a unified framework for self-supervised learning across different data modalities: text, audio and images.
Importantly, the predicted targets for pre-training are contextualized latent representations of the inputs, rather than modality-specific, context-independent targets.

The abstract from the paper is the following:

*While the general idea of self-supervised learning is identical across modalities, the actual algorithms and
objectives differ widely because they were developed with a single modality in mind. To get us closer to general
self-supervised learning, we present data2vec, a framework that uses the same learning method for either speech,
NLP or computer vision. The core idea is to predict latent representations of the full input data based on a
masked view of the input in a self-distillation setup using a standard Transformer architecture.
Instead of predicting modality-specific targets such as words, visual tokens or units of human speech which
are local in nature, data2vec predicts contextualized latent representations that contain information from
the entire input. Experiments on the major benchmarks of speech recognition, image classification, and
natural language understanding demonstrate a new state of the art or competitive performance to predominant approaches.
Models and code are available at www.github.com/pytorch/fairseq/tree/master/examples/data2vec.*
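In the self-distillation setup described in the abstract, the teacher that produces the latent targets is not trained directly: its weights track the student's via an exponential moving average (EMA). A minimal stdlib sketch of that update rule (the decay value and flat parameter lists are illustrative, not the library's API):

```python
def ema_update(teacher_params, student_params, decay=0.999):
    """Move each teacher parameter a small step toward its student counterpart.

    new_teacher = decay * teacher + (1 - decay) * student
    """
    return [decay * t + (1.0 - decay) * s for t, s in zip(teacher_params, student_params)]


# With decay=0.9, a teacher weight of 1.0 paired with a student weight of 0.0
# moves to 0.9 after one update.
updated = ema_update([1.0], [0.0], decay=0.9)
```

A high decay keeps the teacher's targets stable while still letting them improve as the student learns.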
Tips:

- This implementation has a shared encoder for all different modalities and different pre-processors for each modality.
> **Reviewer:** Think this needs to be updated. Maybe we can just write that both Data2VecAudio and Data2VecText have been pretrained using the same method.
>
> **Author:** I changed it, let me know what you think.
For example, in the case of text, preprocessing is identical to [`RobertaModel`], including tokenization.

This model was contributed by [edugp](https://huggingface.co/edugp).
The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec).
## Data2VecTextConfig

[[autodoc]] Data2VecTextConfig

## Data2VecAudioConfig

[[autodoc]] Data2VecAudioConfig

## Data2VecAudioModel

[[autodoc]] Data2VecAudioModel
- forward

## Data2VecAudioForAudioFrameClassification

[[autodoc]] Data2VecAudioForAudioFrameClassification
- forward

## Data2VecAudioForCTC

[[autodoc]] Data2VecAudioForCTC
- forward
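`Data2VecAudioForCTC` emits per-frame token logits that are decoded with the standard CTC rules: take the greedy path, merge repeated ids, and drop blanks. A stdlib sketch of that collapse step (the id values and blank id are illustrative):

```python
def ctc_collapse(frame_ids, blank_id=0):
    """Collapse a greedy CTC path: merge consecutive duplicates, drop blanks."""
    collapsed = []
    previous = None
    for token_id in frame_ids:
        # Keep a token only when it differs from the previous frame and is not blank
        if token_id != previous and token_id != blank_id:
            collapsed.append(token_id)
        previous = token_id
    return collapsed


# The greedy path [0, 7, 7, 0, 4, 4, 4, 0, 7] collapses to [7, 4, 7]
decoded = ctc_collapse([0, 7, 7, 0, 4, 4, 4, 0, 7])
```

In practice this step is handled by the processor's `batch_decode`, but the collapse logic is what makes repeated frames and blanks disappear from the transcription.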
## Data2VecAudioForSequenceClassification

[[autodoc]] Data2VecAudioForSequenceClassification
- forward

## Data2VecAudioForXVector

[[autodoc]] Data2VecAudioForXVector
- forward
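`Data2VecAudioForXVector` produces utterance-level speaker embeddings that are typically compared with cosine similarity for speaker verification. A stdlib sketch of that comparison (the vectors are illustrative stand-ins for model outputs):

```python
import math


def cosine_similarity(a, b):
    """Cosine similarity between two embedding vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)


# Identical embeddings score 1.0; orthogonal embeddings score 0.0.
same = cosine_similarity([1.0, 0.0], [1.0, 0.0])
different = cosine_similarity([1.0, 0.0], [0.0, 1.0])
```

A threshold on this score (tuned on held-out pairs) then decides whether two utterances come from the same speaker.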
## Data2VecTextModel

[[autodoc]] Data2VecTextModel
- forward

## Data2VecTextForCausalLM

[[autodoc]] Data2VecTextForCausalLM
- forward

## Data2VecTextForMaskedLM

[[autodoc]] Data2VecTextForMaskedLM
- forward

## Data2VecTextForSequenceClassification

[[autodoc]] Data2VecTextForSequenceClassification
- forward

## Data2VecTextForMultipleChoice

[[autodoc]] Data2VecTextForMultipleChoice
- forward

## Data2VecTextForTokenClassification

[[autodoc]] Data2VecTextForTokenClassification
- forward

## Data2VecTextForQuestionAnswering

[[autodoc]] Data2VecTextForQuestionAnswering
- forward
@@ -40,6 +40,7 @@
    convnext,
    cpm,
    ctrl,
    data2vec,
    deberta,
    deberta_v2,
    deit,
@@ -121,6 +121,8 @@
        ("unispeech-sat", "UniSpeechSatConfig"),
        ("unispeech", "UniSpeechConfig"),
        ("wavlm", "WavLMConfig"),
        ("data2vec-audio", "Data2VecAudioConfig"),
        ("data2vec-text", "Data2VecTextConfig"),
    ]
)
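The entries added above extend the model-type-to-config-name table that the auto classes consult. A simplified stand-in for that lookup (only the subset touched by this PR; `config_class_for` is a hypothetical helper, not the library's API):

```python
from collections import OrderedDict

# Subset of CONFIG_MAPPING_NAMES touched by this PR: model_type -> config class name
CONFIG_MAPPING_NAMES = OrderedDict(
    [
        ("wavlm", "WavLMConfig"),
        ("data2vec-audio", "Data2VecAudioConfig"),
        ("data2vec-text", "Data2VecTextConfig"),
    ]
)


def config_class_for(model_type):
    # AutoConfig-style resolution, sketched: unknown model types raise a KeyError
    return CONFIG_MAPPING_NAMES[model_type]
```

With these entries in place, loading a checkpoint whose config declares `"model_type": "data2vec-audio"` resolves to `Data2VecAudioConfig`.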
@@ -177,6 +179,8 @@
        ("xlnet", "XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("data2vec-text", "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("data2vec-audio", "DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),

@@ -321,10 +325,14 @@
        ("xlsr_wav2vec2", "XLSR-Wav2Vec2"),
        ("mluke", "mLUKE"),
        ("layoutxlm", "LayoutXLM"),
        ("data2vec-audio", "Data2VecAudio"),
        ("data2vec-text", "Data2VecText"),
    ]
)

-SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([("openai-gpt", "openai")])
+SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
+    [("openai-gpt", "openai"), ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec")]
+)

> **Author:** Is this ok @sgugger? Added it like this now.

def model_type_to_module_name(key):
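The special-case table above exists because both `data2vec-audio` and `data2vec-text` live in a single `data2vec` module. A simplified sketch of how `model_type_to_module_name` uses it, assuming the fallback normalizes dashes to underscores (as model types like `deberta-v2` suggest):

```python
from collections import OrderedDict

# Model types whose module name cannot be derived mechanically
SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
    [("openai-gpt", "openai"), ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec")]
)


def model_type_to_module_name(key):
    # Check the special cases first; otherwise dashes become underscores
    if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:
        return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
    return key.replace("-", "_")
```

Both new model types therefore resolve to the shared `data2vec` module, while ordinary types keep the mechanical mapping.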
> **Reviewer:** The link looks a bit weird - can we change it?
>
> **Reviewer:** There is surely an arxiv link, no?
>
> **Author:** Indeed! I'll change it to this link.