[WIP] Add FLAVA model
This PR aims to add the [FLAVA](https://arxiv.org/abs/2112.04482) model to the transformers repo.

The following checklist tracks what needs to be done for this PR to be complete:

- [x] Flava init
- [x] Flava base models
- [x] Flava layers
- [x] Flava Configs
- [x] Flava encoders
- [x] Flava pretraining models
- [ ] Flava classification/retrieval models (in progress)
- [x] Documentation updates (in progress)
- [x] Imports updates (in progress)
- [x] Argstring updates
- [x] Flava pretrained checkpoints (in progress)
- [ ] Flava tests
- [x] Flava processors (in progress)
- [x] Sanity check
- [x] Lint
apsdehal committed Apr 14, 2022
1 parent d55fcbc commit c1489f5
Showing 20 changed files with 5,011 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -214,6 +214,8 @@
    title: Encoder Decoder Models
  - local: model_doc/flaubert
    title: FlauBERT
  - local: model_doc/flava
    title: FLAVA
  - local: model_doc/fnet
    title: FNet
  - local: model_doc/fsmt
93 changes: 93 additions & 0 deletions docs/source/en/model_doc/flava.mdx
@@ -0,0 +1,93 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# FLAVA

## Overview

The FLAVA model was proposed in [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela, and was accepted at CVPR 2022.

The paper aims at creating a single unified foundation model that can work across vision, language,
and vision-and-language multimodal tasks.

The abstract from the paper is the following:

State-of-the-art vision and vision-and-language models rely on large-scale visio-linguistic pretraining for obtaining good performance on a variety
of downstream tasks. Generally, such models are often either cross-modal (contrastive) or multi-modal
(with earlier fusion) but not both; and they often only target specific modalities or tasks. A promising
direction would be to use a single holistic universal model, as a "foundation", that targets all modalities
at once -- a true vision and language foundation model should be good at vision tasks, language tasks, and
cross- and multi-modal vision and language tasks. We introduce FLAVA as such a model and demonstrate
impressive performance on a wide range of 35 tasks spanning these target modalities.


<!-- Tips: -->

This model was contributed by [aps](https://huggingface.co/aps).
<!-- The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).-->
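
Below is a rough usage sketch (not part of the committed file) of how the model is expected to be called once pretrained weights are published; the checkpoint identifier, the processor call signature, and the output contents are assumptions modeled on the CLIP API, not taken from this diff.

```python
import requests
import torch
from PIL import Image
from transformers import FLAVAModel, FLAVAProcessor

# "facebook/flava-full" is a hypothetical checkpoint id; no weights ship with this commit.
model = FLAVAModel.from_pretrained("facebook/flava-full")
processor = FLAVAProcessor.from_pretrained("facebook/flava-full")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Assumed CLIP-style signature: tokenize the caption and preprocess the image in one call.
inputs = processor(text=["a photo of two cats"], images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = model(**inputs)  # expected to expose image, text, and multimodal hidden states
```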


## FLAVAConfig

[[autodoc]] FLAVAConfig
- from_configs
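
For illustration (not in the diff), the top-level configuration can presumably be assembled from its sub-configurations with `from_configs`; the keyword names below are assumptions, since the method's signature is not shown here.

```python
from transformers import FLAVAConfig, FLAVAImageConfig, FLAVAMultimodalConfig, FLAVATextConfig

text_config = FLAVATextConfig()
image_config = FLAVAImageConfig()
multimodal_config = FLAVAMultimodalConfig()

# Assumed keyword arguments; check configuration_flava.py for the actual signature.
config = FLAVAConfig.from_configs(
    image_config=image_config,
    text_config=text_config,
    multimodal_config=multimodal_config,
)
```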

## FLAVATextConfig

[[autodoc]] FLAVATextConfig

## FLAVAImageConfig

[[autodoc]] FLAVAImageConfig

## FLAVAMultimodalConfig

[[autodoc]] FLAVAMultimodalConfig

## FLAVACodebookConfig

[[autodoc]] FLAVACodebookConfig

## FLAVAForPretraining

[[autodoc]] FLAVAForPretraining
- forward

## FLAVAModel

[[autodoc]] FLAVAModel
- forward
- get_text_features
- get_image_features
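
An illustrative sketch (not part of the committed file) of the unimodal helpers for CLIP-style retrieval; the checkpoint id and argument handling are assumptions.

```python
import torch
from PIL import Image
from transformers import FLAVAModel, FLAVAProcessor

model = FLAVAModel.from_pretrained("facebook/flava-full")  # hypothetical checkpoint id
processor = FLAVAProcessor.from_pretrained("facebook/flava-full")

text_inputs = processor(text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True)
image_inputs = processor(images=Image.open("cat.png"), return_tensors="pt")

with torch.no_grad():
    text_embeds = model.get_text_features(**text_inputs)
    image_embeds = model.get_image_features(**image_inputs)

# Downstream, these embeddings would be pooled/projected and compared (e.g. cosine
# similarity) for retrieval, mirroring the paper's image-text contrastive objective.
```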

## FLAVACodebook

[[autodoc]] FLAVACodebook
- forward
- get_codebook_indices
- get_codebook_probs
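
In the paper, an image codebook (a dVAE-style image tokenizer) supplies the targets for the masked image modeling objective. A hypothetical sketch of how these helpers might be invoked; the checkpoint id and argument names are assumptions.

```python
from PIL import Image
from transformers import FLAVACodebook, FLAVACodebookFeatureExtractor

# Hypothetical checkpoint id for the image codebook weights.
codebook = FLAVACodebook.from_pretrained("facebook/flava-image-codebook")
feature_extractor = FLAVACodebookFeatureExtractor.from_pretrained("facebook/flava-image-codebook")

inputs = feature_extractor(images=Image.open("cat.png"), return_tensors="pt")

# Discrete token ids (and their distribution) for each image patch, used as
# masked-image-modeling targets during pretraining.
indices = codebook.get_codebook_indices(pixel_values=inputs["pixel_values"])
probs = codebook.get_codebook_probs(pixel_values=inputs["pixel_values"])
```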

## FLAVATextModel

[[autodoc]] FLAVATextModel
- forward

## FLAVAImageModel

[[autodoc]] FLAVAImageModel
- forward

## FLAVAMultimodalModel

[[autodoc]] FLAVAMultimodalModel
- forward
43 changes: 43 additions & 0 deletions src/transformers/__init__.py
@@ -191,6 +191,17 @@
"models.electra": ["ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "ElectraConfig", "ElectraTokenizer"],
"models.encoder_decoder": ["EncoderDecoderConfig"],
"models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"],
"models.flava": [
"FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP",
"FLAVACodebookConfig",
"FLAVACodebookFeatureExtractor",
"FLAVAConfig",
"FLAVAFeatureExtractor",
"FLAVAImageConfig",
"FLAVAMultimodalConfig",
"FLAVAProcessor",
"FLAVATextConfig",
],
"models.fnet": ["FNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FNetConfig", "FNetTokenizer"],
"models.fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig", "FSMTTokenizer"],
"models.funnel": ["FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP", "FunnelConfig", "FunnelTokenizer"],
@@ -986,6 +997,19 @@
"FlaubertWithLMHeadModel",
]
)
_import_structure["models.flava"].extend(
[
"FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST",
"FLAVACodebook",
"FLAVAForPretraining",
"FLAVAImageModel",
"FLAVALayer",
"FLAVAModel",
"FLAVAMultimodalModel",
"FLAVAPreTrainedModel",
"FLAVATextModel",
]
)
_import_structure["models.fnet"].extend(
[
"FNET_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2565,6 +2589,17 @@
    from .models.electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig, ElectraTokenizer
    from .models.encoder_decoder import EncoderDecoderConfig
    from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer
    from .models.flava import (
        FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP,
        FLAVACodebookConfig,
        FLAVACodebookFeatureExtractor,
        FLAVAConfig,
        FLAVAFeatureExtractor,
        FLAVAImageConfig,
        FLAVAMultimodalConfig,
        FLAVAProcessor,
        FLAVATextConfig,
    )
    from .models.fnet import FNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FNetConfig, FNetTokenizer
    from .models.fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig, FSMTTokenizer
    from .models.funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig, FunnelTokenizer
@@ -3238,6 +3273,14 @@
            FlaubertModel,
            FlaubertWithLMHeadModel,
        )
        from .models.flava import (
            FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST,
            FLAVAForPretraining,
            FLAVAImageModel,
            FLAVAModel,
            FLAVAPreTrainedModel,
            FLAVATextModel,
        )
        from .models.fnet import (
            FNET_PRETRAINED_MODEL_ARCHIVE_LIST,
            FNetForMaskedLM,
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -54,6 +54,7 @@
    electra,
    encoder_decoder,
    flaubert,
    flava,
    fnet,
    fsmt,
    funnel,
3 changes: 3 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -62,6 +62,7 @@
("canine", "CanineConfig"),
("roformer", "RoFormerConfig"),
("clip", "CLIPConfig"),
("flava", "FLAVAConfig"),
("bigbird_pegasus", "BigBirdPegasusConfig"),
("deit", "DeiTConfig"),
("luke", "LukeConfig"),
@@ -164,6 +165,7 @@
("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("roformer", "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("flava", "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("bigbird_pegasus", "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("luke", "LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -258,6 +260,7 @@
("canine", "Canine"),
("roformer", "RoFormer"),
("clip", "CLIP"),
("flava", "flava"),
("bigbird_pegasus", "BigBirdPegasus"),
("deit", "DeiT"),
("luke", "LUKE"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/feature_extraction_auto.py
@@ -47,6 +47,7 @@
("detr", "DetrFeatureExtractor"),
("layoutlmv2", "LayoutLMv2FeatureExtractor"),
("clip", "CLIPFeatureExtractor"),
("flava", "FLAVAFeatureExtractor"),
("perceiver", "PerceiverFeatureExtractor"),
("swin", "ViTFeatureExtractor"),
("vit_mae", "ViTFeatureExtractor"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -59,6 +59,7 @@
("canine", "CanineModel"),
("roformer", "RoFormerModel"),
("clip", "CLIPModel"),
("flava", "FLAVAModel"),
("bigbird_pegasus", "BigBirdPegasusModel"),
("deit", "DeiTModel"),
("luke", "LukeModel"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -38,6 +38,7 @@
PROCESSOR_MAPPING_NAMES = OrderedDict(
    [
        ("clip", "CLIPProcessor"),
        ("flava", "FLAVAProcessor"),
        ("layoutlmv2", "LayoutLMv2Processor"),
        ("layoutxlm", "LayoutXLMProcessor"),
        ("speech_to_text", "Speech2TextProcessor"),
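
With the "flava" entries registered in the auto mappings above, the generic Auto classes should resolve FLAVA checkpoints by their model type. A quick sketch (the checkpoint id is hypothetical):

```python
from transformers import AutoConfig, AutoFeatureExtractor, AutoModel, AutoProcessor

checkpoint = "facebook/flava-full"  # hypothetical checkpoint id

config = AutoConfig.from_pretrained(checkpoint)                       # -> FLAVAConfig
model = AutoModel.from_pretrained(checkpoint)                         # -> FLAVAModel
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)  # -> FLAVAFeatureExtractor
processor = AutoProcessor.from_pretrained(checkpoint)                 # -> FLAVAProcessor
```
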
7 changes: 7 additions & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -218,6 +218,13 @@
"CLIPTokenizerFast" if is_tokenizers_available() else None,
),
),
# (
# "flava",
# (
# "CLIPTokenizer",
# "CLIPTokenizerFast" if is_tokenizers_available() else None,
# ),
# ),
("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
(
"perceiver",
77 changes: 77 additions & 0 deletions src/transformers/models/flava/__init__.py
@@ -0,0 +1,77 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule, is_torch_available, is_vision_available


_import_structure = {
    "configuration_flava": [
        "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "FLAVACodebookConfig",
        "FLAVAConfig",
        "FLAVAImageConfig",
        "FLAVAMultimodalConfig",
        "FLAVATextConfig",
    ],
}

if is_vision_available():
    _import_structure["feature_extraction_flava"] = ["FLAVACodebookFeatureExtractor", "FLAVAFeatureExtractor"]
    _import_structure["processing_flava"] = ["FLAVAProcessor"]

if is_torch_available():
    _import_structure["modeling_flava"] = [
        "FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST",
        "FLAVACodebook",
        "FLAVAForPretraining",
        "FLAVAImageModel",
        "FLAVAModel",
        "FLAVAMultimodalModel",
        "FLAVAPreTrainedModel",
        "FLAVATextModel",
    ]

if TYPE_CHECKING:
    from .configuration_flava import (
        FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP,
        FLAVAConfig,
        FLAVAImageConfig,
        FLAVAMultimodalConfig,
        FLAVATextConfig,
    )

    if is_vision_available():
        from .feature_extraction_flava import FLAVACodebookFeatureExtractor, FLAVAFeatureExtractor
        from .processing_flava import FLAVAProcessor

    if is_torch_available():
        from .modeling_flava import (
            FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST,
            FLAVACodebook,
            FLAVAImageModel,
            FLAVAModel,
            FLAVAMultimodalModel,
            FLAVAPreTrainedModel,
            FLAVATextModel,
        )

else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
