From d2a084835fcf46306e768827c5811614800345a4 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 8 Feb 2022 21:33:40 +0530
Subject: [PATCH 01/65] feat: initial implementation of convnext in tensorflow.

---
 .../models/convnext/modeling_tf_convnext.py   | 504 ++++++++++++++++++
 1 file changed, 504 insertions(+)
 create mode 100644 src/transformers/models/convnext/modeling_tf_convnext.py

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
new file mode 100644
index 0000000000000..e67088ba6d7ea
--- /dev/null
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -0,0 +1,504 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms Inc., Sayak Paul, and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 ConvNext model."""
+
+import collections.abc
+import math
+from typing import Dict, Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
+from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput
+from ...modeling_tf_utils import (
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    get_initializer,
+    input_processing,
+    keras_serializable,
+    shape_list,
+)
+from ...utils import logging
+from .configuration_convnext import ConvNextConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+_CONFIG_FOR_DOC = "ConvNextConfig"
+_CHECKPOINT_FOR_DOC = "facebook/ConvNext-tiny-224"
+
+
+class TFConvNextDropPath(tf.keras.layers.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    References:
+        (1) github.com:rwightman/pytorch-image-models
+    """
+
+    def __init__(self, drop_path, **kwargs):
+        super().__init__(**kwargs)
+        self.drop_path = drop_path
+
+    def call(self, x, training=None):
+        if training:
+            keep_prob = 1 - self.drop_path
+            shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
+            random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
+            random_tensor = tf.floor(random_tensor)
+            return (x / keep_prob) * random_tensor
+        return x
+
+
+class TFConvNextEmbeddings(tf.keras.layers.Layer):
+    """This class is comparable to (and inspired by) the SwinEmbeddings class
+    found in src/transformers/models/swin/modeling_swin.py.
+    """
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.patch_embeddings = tf.keras.layers.Conv2D(
+            filters=config.hidden_sizes[0],
+            kernel_size=config.patch_size,
+            strides=config.patch_size,
+            name="patch_embeddings",
+            kernel_initializer=get_initializer(self.config.initializer_range),
+            bias_initializer="zeros",
+        )
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
+
+    def call(self, pixel_values):
+        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
+        # So change the input format from `NCHW` to `NHWC`.
+        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+
+        embeddings = self.patch_embeddings(pixel_values)
+        embeddings = self.layernorm(embeddings)
+        return embeddings
+
+
+class TFConvNextLayer(tf.keras.layers.Layer):
+    """This corresponds to the `Block` class in the original implementation.
+
+    There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
+    H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back
+
+    The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to
+    follow NHWC ordering, we can just apply the operations straight-away without the permutation.
+
+    Args:
+        config ([`ConvNextConfig`]): Model configuration class.
+        dim (`int`): Number of input channels.
+        drop_path (`float`): Stochastic depth rate. Default: 0.0.
+    """
+
+    def __init__(self, config, dim, drop_path=0, **kwargs):
+        # (sayakpaul): need to figure out the layer names.
+        super().__init__(**kwargs)
+        self.dwconv = tf.keras.layers.Conv2D(
+            filters=dim,
+            kernel_size=7,
+            padding="same",
+            groups=dim,
+            kernel_initializer=get_initializer(self.config.initializer_range),
+            bias_initializer="zeros",
+        )  # depthwise conv
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self.pwconv1 = tf.keras.layers.Dense(
+            units=4 * dim,
+            kernel_initializer=get_initializer(self.config.initializer_range),
+            bias_initializer="zeros",
+        )  # pointwise/1x1 convs, implemented with linear layers
+        self.act = get_tf_activation[config.hidden_act]
+        self.pwconv2 = tf.keras.layers.Dense(
+            units=dim,
+            kernel_initializer=get_initializer(self.config.initializer_range),
+            bias_initializer="zeros",
+        )
+        self.layer_scale_parameter = (
+            tf.Variable(config.layer_scale_init_value * tf.ones((dim,)), trainable=True, name="layer_scale_parameter")
+            if config.layer_scale_init_value > 0
+            else None
+        )
+        self.drop_path = TFConvNextDropPath(drop_path) if drop_path > 0.0 else tf.identity
+
+    def call(self, hidden_states):
+        input = hidden_states
+        x = self.dwconv(hidden_states)
+        x = self.layernorm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+
+        if self.layer_scale_parameter is not None:
+            x = self.layer_scale_parameter * x
+
+        x = input + self.drop_path(x)
+        return x
+
+
+class TFConvNextStage(tf.keras.layers.Layer):
+    """ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks.
+
+    Args:
+        config ([`ConvNextConfig`]): Model configuration class.
+        in_channels (`int`): Number of input channels.
+        out_channels (`int`): Number of output channels.
+        depth (`int`): Number of residual blocks.
+        drop_path_rates(`List[float]`): Stochastic depth rates for each layer.
+    """
+
+    def __init__(
+        self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs
+    ):
+        # (sayakpaul): need to figure out the names.
+        super().__init__(**kwargs)
+
+        if in_channels != out_channels or stride > 1:
+            self.downsampling_layer = tf.keras.Sequential(
+                [
+                    tf.keras.layers.LayerNormalization(epsilon=1e-6),
+                    tf.keras.layers.Conv2D(
+                        filters=out_channels,
+                        kernel_size=kernel_size,
+                        strides=stride,
+                        kernel_initializer=get_initializer(self.config.initializer_range),
+                        bias_initializer="zeros",
+                    ),
+                ]
+            )
+        else:
+            self.downsampling_layer = tf.identity
+        drop_path_rates = drop_path_rates or [0.0] * depth
+        self.layers = tf.keras.Sequential(
+            [*[TFConvNextLayer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]]
+        )
+
+    def forward(self, hidden_states):
+        hidden_states = self.downsampling_layer(hidden_states)
+        hidden_states = self.layers(hidden_states)
+        return hidden_states
+
+
+class TFConvNextEncoder(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        # (sayakpaul): need to figure out the naming convention for `dwconv`,
+        # `pwconv1`, and `pwconv2`.
+        super().__init__(**kwargs)
+        self.stages = []
+        drop_path_rates = [x.item() for x in tf.linspace(0, config.drop_path_rate, sum(config.depths))]
+        cur = 0
+        prev_chs = config.hidden_sizes[0]
+        for i in range(config.num_stages):
+            out_chs = config.hidden_sizes[i]
+            stage = TFConvNextStage(
+                config,
+                in_channels=prev_chs,
+                out_channels=out_chs,
+                stride=2 if i > 0 else 1,
+                depth=config.depths[i],
+                drop_path_rates=drop_path_rates[cur],
+            )
+            self.stages.append(stage)
+            cur += config.depths[i]
+            prev_chs = out_chs
+
+    def call(self, hidden_states, output_hidden_states=False, return_dict=True):
+        all_hidden_states = () if output_hidden_states else None
+
+        for i, layer_module in enumerate(self.stages):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            hidden_states = layer_module(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
+
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+        )
+
+
+class TFConvNextPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = ConvNextConfig
+    base_model_prefix = "convnext"
+    main_input_name = "pixel_values"
+
+    @property
+    def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+        """
+        Dummy inputs to build the network.
+
+        Returns:
+            `Dict[str, tf.Tensor]`: The dummy inputs.
+        """
+        VISION_DUMMY_INPUTS = tf.random.uniform(
+            shape=(3, self.config.num_channels, self.config.image_size, self.config.image_size), dtype=tf.float32
+        )
+        return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)}
+
+    @tf.function(
+        input_signature=[
+            {
+                "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
+            }
+        ]
+    )
+    def serving(self, inputs):
+        """
+        Method used for serving the model.
+
+        Args:
+            inputs (`Dict[str, tf.Tensor]`):
+                The input of the saved model as a dictionary of tensors.
+        """
+        return self.call(inputs)
+
+
+CONVNEXT_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TF 2.0 models accepts two formats as inputs:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional arguments.
+
+    This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the
+    tensors in the first argument of the model call function: `model(inputs)`.
+
+    </Tip>
+
+    Parameters:
+        config ([`ConvNextConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CONVNEXT_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`ConvNextFeatureExtractor`]. See
+            [`ConvNextFeatureExtractor.__call__`] for details.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
+            in eager mode, in graph mode the value will always be set to True.
+"""
+
+
+@add_start_docstrings(
+    "The bare ConvNext model outputting raw features without any specific head on top.",
+    CONVNEXT_START_DOCSTRING,
+)
+class TFConvNextModel(TFConvNextPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.config = config
+
+        self.embeddings = TFConvNextEmbeddings(config)
+        self.encoder = TFConvNextEncoder(config)
+
+        # final layernorm layer
+        self.layernorm = tf.keras.layers.Layer(epsilon=config.layer_norm_eps)
+
+        # global average pooling
+        self.pooler = tf.keras.layers.GlobalAvgPool2D()
+
+    @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        pixel_values: Optional[TFModelInputType] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import ConvNextFeatureExtractor, TFConvNextModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224")
+        >>> model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224")
+
+        >>> inputs = feature_extractor(images=image, return_tensors="tf")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if "input_ids" in inputs:
+            inputs["pixel_values"] = inputs.pop("input_ids")
+
+        if inputs["pixel_values"] is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(pixel_values)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+
+        pooled_output = self.layernorm(self.pooler(last_hidden_state))
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@add_start_docstrings(
+    """
+    ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+    ImageNet.
+    """,
+    CONVNEXT_START_DOCSTRING,
+)
+class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.num_labels = config.num_labels
+        self.convnext = TFConvNextModel(config)
+
+        # Classifier head
+        self.classifier = tf.keras.layers.Dense(
+            units=config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="classifier",
+        )
+
+    @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        pixel_values: Optional[TFModelInputType] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        training: Optional[bool] = False,
+        **kwargs,
+    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
+        r"""
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import ConvNextFeatureExtractor, TFConvNextForImageClassification
+        >>> import tensorflow as tf
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("facebook/convnext-tiny-224")
+        >>> model = TFViTForImageClassification.from_pretrained("facebook/convnext-tiny-224")
+
+        >>> inputs = feature_extractor(images=image, return_tensors="tf")
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        >>> # model predicts one of the 1000 ImageNet classes
+        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
+        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
+        ```"""
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            labels=labels,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if "input_ids" in inputs:
+            inputs["pixel_values"] = inputs.pop("input_ids")
+
+        outputs = self.convnext(
+            inputs["pixel_values"], output_hidden_states=output_hidden_states, return_dict=return_dict
+        )
+
+        pooled_output = outputs.pooler_output if return_dict else outputs[1]
+
+        logits = self.classifier(pooled_output)
+        loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
+
+        if not inputs["return_dict"]:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+        )

From 583769c722162fecf30f7c6ad05450996cf38f3c Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 9 Feb 2022 07:23:05 +0530
Subject: [PATCH 02/65] fix: sample code for the classification model.

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index e67088ba6d7ea..52e10b4464202 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -460,7 +460,7 @@ def call(
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("facebook/convnext-tiny-224")
+        >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224")
         >>> model = TFViTForImageClassification.from_pretrained("facebook/convnext-tiny-224")
 
         >>> inputs = feature_extractor(images=image, return_tensors="tf")

From c667d93e6a49bd3eb327d1572178792842e3cd27 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 9 Feb 2022 07:30:52 +0530
Subject: [PATCH 03/65] chore: added check for pixel_values in the
 classification model.

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 52e10b4464202..39600954c8df6 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -484,6 +484,9 @@ def call(
         if "input_ids" in inputs:
             inputs["pixel_values"] = inputs.pop("input_ids")
 
+        if inputs["pixel_values"] is None:
+            raise ValueError("You have to specify pixel_values")
+
         outputs = self.convnext(
             inputs["pixel_values"], output_hidden_states=output_hidden_states, return_dict=return_dict
         )

From 7aecfa9a1f04f9b75eaa5177d5866079702b4d44 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 9 Feb 2022 07:32:25 +0530
Subject: [PATCH 04/65] chore: set bias initializer in the classification head.

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 39600954c8df6..b265de9e406af 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -427,6 +427,7 @@ def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels,
             kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer="zeros",
             name="classifier",
         )
 

From 222c46546568d1c6de6ce4f0aa137d51b32e9527 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 9 Feb 2022 08:02:13 +0530
Subject: [PATCH 05/65] chore: updated license terms.

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index b265de9e406af..8defe1e063d79 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2022 Meta Platforms Inc., Sayak Paul, and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2022 Meta Platforms Inc. and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 835dbdb99041395b368f468a0f0fdf5350c07e46 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 9 Feb 2022 08:45:25 +0530
Subject: [PATCH 06/65] chore: removed unused imports

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 8defe1e063d79..4bf7d42f3deb9 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -14,8 +14,7 @@
 # limitations under the License.
 """ TF 2.0 ConvNext model."""
 
-import collections.abc
-import math
+
 from typing import Dict, Optional, Tuple, Union
 
 import numpy as np
@@ -30,8 +29,6 @@
     TFSequenceClassificationLoss,
     get_initializer,
     input_processing,
-    keras_serializable,
-    shape_list,
 )
 from ...utils import logging
 from .configuration_convnext import ConvNextConfig

From d6f91b64d125f6aba976848546fd55e6ed2fe7cb Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 9 Feb 2022 09:24:18 +0530
Subject: [PATCH 07/65] feat: enabled training argument when using drop_path.

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 4bf7d42f3deb9..3fccfccc4720f 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -134,7 +134,7 @@ def __init__(self, config, dim, drop_path=0, **kwargs):
         )
         self.drop_path = TFConvNextDropPath(drop_path) if drop_path > 0.0 else tf.identity
 
-    def call(self, hidden_states):
+    def call(self, hidden_states, training=False):
         input = hidden_states
         x = self.dwconv(hidden_states)
         x = self.layernorm(x)
@@ -145,7 +145,7 @@ def call(self, hidden_states):
         if self.layer_scale_parameter is not None:
             x = self.layer_scale_parameter * x
 
-        x = input + self.drop_path(x)
+        x = input + self.drop_path(x, training=training)
         return x
 
 

From e1fec885706629faf6c520c1d34a6532d551ba9a Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 10 Feb 2022 12:12:24 +0530
Subject: [PATCH 08/65] chore: replaced tf.identity with
 layers.Activation(linear).

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 3fccfccc4720f..ebc2a81f3d3a6 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -132,7 +132,9 @@ def __init__(self, config, dim, drop_path=0, **kwargs):
             if config.layer_scale_init_value > 0
             else None
         )
-        self.drop_path = TFConvNextDropPath(drop_path) if drop_path > 0.0 else tf.identity
+        # Using `layers.Activation` instead of `tf.identity` to better control `training`
+        # behaviour.
+        self.drop_path = TFConvNextDropPath(drop_path) if drop_path > 0.0 else tf.keras.layers.Activation("linear")
 
     def call(self, hidden_states, training=False):
         input = hidden_states

From 30e4bcb6f4f288b1004557ad93b56f43b02f0384 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Fri, 11 Feb 2022 11:09:00 +0530
Subject: [PATCH 09/65] chore: edited default checkpoint.

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index ebc2a81f3d3a6..52817a509cb67 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -38,7 +38,7 @@
 
 
 _CONFIG_FOR_DOC = "ConvNextConfig"
-_CHECKPOINT_FOR_DOC = "facebook/ConvNext-tiny-224"
+_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"
 
 
 class TFConvNextDropPath(tf.keras.layers.Layer):

From b0051acee0066ef7c7b9595224e0c78506b11b78 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Fri, 11 Feb 2022 22:29:11 +0530
Subject: [PATCH 10/65] fix: minor bugs in the initializations.

---
 .../models/convnext/modeling_tf_convnext.py    | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 52817a509cb67..bb99f472bc1dd 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -73,7 +73,7 @@ def __init__(self, config, **kwargs):
             kernel_size=config.patch_size,
             strides=config.patch_size,
             name="patch_embeddings",
-            kernel_initializer=get_initializer(self.config.initializer_range),
+            kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
         )
         self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
@@ -104,7 +104,7 @@ class TFConvNextLayer(tf.keras.layers.Layer):
         drop_path (`float`): Stochastic depth rate. Default: 0.0.
     """
 
-    def __init__(self, config, dim, drop_path=0, **kwargs):
+    def __init__(self, config, dim, drop_path=0.0, **kwargs):
         # (sayakpaul): need to figure out the layer names.
         super().__init__(**kwargs)
         self.dwconv = tf.keras.layers.Conv2D(
@@ -112,19 +112,19 @@ def __init__(self, config, dim, drop_path=0, **kwargs):
             kernel_size=7,
             padding="same",
             groups=dim,
-            kernel_initializer=get_initializer(self.config.initializer_range),
+            kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
         )  # depthwise conv
         self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
         self.pwconv1 = tf.keras.layers.Dense(
             units=4 * dim,
-            kernel_initializer=get_initializer(self.config.initializer_range),
+            kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
         )  # pointwise/1x1 convs, implemented with linear layers
-        self.act = get_tf_activation[config.hidden_act]
+        self.act = get_tf_activation(config.hidden_act)
         self.pwconv2 = tf.keras.layers.Dense(
             units=dim,
-            kernel_initializer=get_initializer(self.config.initializer_range),
+            kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
         )
         self.layer_scale_parameter = (
@@ -176,7 +176,7 @@ def __init__(
                         filters=out_channels,
                         kernel_size=kernel_size,
                         strides=stride,
-                        kernel_initializer=get_initializer(self.config.initializer_range),
+                        kernel_initializer=get_initializer(config.initializer_range),
                         bias_initializer="zeros",
                     ),
                 ]
@@ -200,7 +200,7 @@ def __init__(self, config, **kwargs):
         # `pwconv1`, and `pwconv2`.
         super().__init__(**kwargs)
         self.stages = []
-        drop_path_rates = [x.item() for x in tf.linspace(0, config.drop_path_rate, sum(config.depths))]
+        drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
         cur = 0
         prev_chs = config.hidden_sizes[0]
         for i in range(config.num_stages):
@@ -335,7 +335,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.encoder = TFConvNextEncoder(config)
 
         # final layernorm layer
-        self.layernorm = tf.keras.layers.Layer(epsilon=config.layer_norm_eps)
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps)
 
         # global average pooling
         self.pooler = tf.keras.layers.GlobalAvgPool2D()

From aeb14f7329d9543c3b661853b8b6c8f5427fca37 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sat, 12 Feb 2022 01:10:18 +0530
Subject: [PATCH 11/65] partial-fix: tf model errors for loading pretrained pt
 weights.

---
 src/transformers/modeling_tf_utils.py                | 12 ++++++++----
 .../models/convnext/configuration_convnext.py        |  3 +++
 .../models/convnext/modeling_tf_convnext.py          |  2 ++
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 54f465215fb2c..be54906fbda6b 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -309,9 +309,13 @@ def booleans_processing(config, **kwargs):
     final_booleans = {}
 
     if tf.executing_eagerly():
-        final_booleans["output_attentions"] = (
-            kwargs["output_attentions"] if kwargs["output_attentions"] is not None else config.output_attentions
-        )
+        # final_booleans["output_attentions"] = (
+        #     kwargs["output_attentions"] if kwargs["output_attentions"] else config.output_attentions
+        # )
+        final_booleans["output_attentions"] = kwargs.get("output_attentions", None)
+        if not final_booleans["output_attentions"]:
+            final_booleans["output_attentions"] = config.output_attentions
+
         final_booleans["output_hidden_states"] = (
             kwargs["output_hidden_states"]
             if kwargs["output_hidden_states"] is not None
@@ -1827,7 +1831,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
+        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
 
     def build(self, input_shape):
         """
diff --git a/src/transformers/models/convnext/configuration_convnext.py b/src/transformers/models/convnext/configuration_convnext.py
index 8d99c657cc639..c09a54e86a7e2 100644
--- a/src/transformers/models/convnext/configuration_convnext.py
+++ b/src/transformers/models/convnext/configuration_convnext.py
@@ -85,6 +85,7 @@ def __init__(
         is_encoder_decoder=False,
         layer_scale_init_value=1e-6,
         drop_path_rate=0.0,
+        image_size=224,
         **kwargs
     ):
         super().__init__(**kwargs)
@@ -99,3 +100,5 @@ def __init__(
         self.layer_norm_eps = layer_norm_eps
         self.layer_scale_init_value = layer_scale_init_value
         self.drop_path_rate = drop_path_rate
+        self.image_size = image_size
+        self.output_attentions = None
diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index bb99f472bc1dd..51f27a8d88a70 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -370,6 +370,8 @@ def call(
         >>> outputs = model(**inputs)
         >>> last_hidden_states = outputs.last_hidden_state
         ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         inputs = input_processing(
             func=self.call,
             config=self.config,

From aec69dcf1e052aa8e84c731b8dab99c1afaf1229 Mon Sep 17 00:00:00 2001
From: ariG23498 <aritra.born2fly@gmail.com>
Date: Sat, 12 Feb 2022 02:40:55 +0530
Subject: [PATCH 12/65] partial-fix: call method updated

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 51f27a8d88a70..db0d8424576ff 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -188,7 +188,7 @@ def __init__(
             [*[TFConvNextLayer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]]
         )
 
-    def forward(self, hidden_states):
+    def call(self, hidden_states):
         hidden_states = self.downsampling_layer(hidden_states)
         hidden_states = self.layers(hidden_states)
         return hidden_states

From 6c0fae263457b7e496ee331716a0079023fb07af Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sat, 12 Feb 2022 12:03:14 +0530
Subject: [PATCH 13/65] partial-fix: cross loading of weights (4x3 variables to
 be matched)

---
 .../models/convnext/modeling_tf_convnext.py   | 56 ++++++++++++++-----
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index db0d8424576ff..8a8946575a26f 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -68,6 +68,8 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
 
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
+        # note that we do not use the `base_name` here in `patch_embeddings`
+        # and `layernorm`
         self.patch_embeddings = tf.keras.layers.Conv2D(
             filters=config.hidden_sizes[0],
             kernel_size=config.patch_size,
@@ -105,8 +107,8 @@ class TFConvNextLayer(tf.keras.layers.Layer):
     """
 
     def __init__(self, config, dim, drop_path=0.0, **kwargs):
-        # (sayakpaul): need to figure out the layer names.
         super().__init__(**kwargs)
+        base_name = kwargs.get("name")
         self.dwconv = tf.keras.layers.Conv2D(
             filters=dim,
             kernel_size=7,
@@ -114,27 +116,38 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs):
             groups=dim,
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
+            name=f"{base_name}.dwconv",
         )  # depthwise conv
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name=f"{base_name}.layernorm")
         self.pwconv1 = tf.keras.layers.Dense(
             units=4 * dim,
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
+            name=f"{base_name}.pwconv1",
         )  # pointwise/1x1 convs, implemented with linear layers
         self.act = get_tf_activation(config.hidden_act)
         self.pwconv2 = tf.keras.layers.Dense(
             units=dim,
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
+            name=f"{base_name}.pwconv2",
         )
         self.layer_scale_parameter = (
-            tf.Variable(config.layer_scale_init_value * tf.ones((dim,)), trainable=True, name="layer_scale_parameter")
+            tf.Variable(
+                config.layer_scale_init_value * tf.ones((dim,)),
+                trainable=True,
+                name=f"{base_name}.layer_scale_parameter",
+            )
             if config.layer_scale_init_value > 0
             else None
         )
         # Using `layers.Activation` instead of `tf.identity` to better control `training`
         # behaviour.
-        self.drop_path = TFConvNextDropPath(drop_path) if drop_path > 0.0 else tf.keras.layers.Activation("linear")
+        self.drop_path = (
+            TFConvNextDropPath(drop_path, name=f"{base_name}.drop_path")
+            if drop_path > 0.0
+            else tf.keras.layers.Activation("linear", name=f"{base_name}.drop_path")
+        )
 
     def call(self, hidden_states, training=False):
         input = hidden_states
@@ -167,25 +180,37 @@ def __init__(
     ):
         # (sayakpaul): need to figure out the names.
         super().__init__(**kwargs)
-
+        base_name = kwargs.get("name")
         if in_channels != out_channels or stride > 1:
             self.downsampling_layer = tf.keras.Sequential(
                 [
-                    tf.keras.layers.LayerNormalization(epsilon=1e-6),
+                    tf.keras.layers.LayerNormalization(
+                        epsilon=1e-6,
+                        name=f"{base_name}.downsampling_layer.0",
+                    ),
                     tf.keras.layers.Conv2D(
                         filters=out_channels,
                         kernel_size=kernel_size,
                         strides=stride,
                         kernel_initializer=get_initializer(config.initializer_range),
                         bias_initializer="zeros",
+                        name=f"{base_name}.downsampling_layer.1",
                     ),
-                ]
+                ],
             )
         else:
-            self.downsampling_layer = tf.identity
+            self.downsampling_layer = tf.keras.layers.Activation("linear")
+
         drop_path_rates = drop_path_rates or [0.0] * depth
         self.layers = tf.keras.Sequential(
-            [*[TFConvNextLayer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]]
+            [
+                *[
+                    TFConvNextLayer(
+                        config, dim=out_channels, drop_path=drop_path_rates[j], name=f"{base_name}.layers.{j}"
+                    )
+                    for j in range(depth)
+                ]
+            ],
         )
 
     def call(self, hidden_states):
@@ -199,6 +224,7 @@ def __init__(self, config, **kwargs):
         # (sayakpaul): need to figure out the naming convention for `dwconv`,
         # `pwconv1`, and `pwconv2`.
         super().__init__(**kwargs)
+        base_name = kwargs.get("name")
         self.stages = []
         drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
         cur = 0
@@ -212,6 +238,7 @@ def __init__(self, config, **kwargs):
                 stride=2 if i > 0 else 1,
                 depth=config.depths[i],
                 drop_path_rates=drop_path_rates[cur],
+                name=f"{base_name}.stages.{i}",
             )
             self.stages.append(stage)
             cur += config.depths[i]
@@ -329,13 +356,16 @@ def serving(self, inputs):
 class TFConvNextModel(TFConvNextPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
+        base_name = kwargs.get("name")
         self.config = config
 
-        self.embeddings = TFConvNextEmbeddings(config)
-        self.encoder = TFConvNextEncoder(config)
+        # Observe the name parameter in `encoder`, `embeddings`, and `layernorm`
+        # Adding `base_name` to the embeddings and layernorm adds errors.
+        self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
+        self.encoder = TFConvNextEncoder(config, name=f"{base_name}.encoder")
 
         # final layernorm layer
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps)
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
 
         # global average pooling
         self.pooler = tf.keras.layers.GlobalAvgPool2D()
@@ -422,7 +452,7 @@ def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
 
         self.num_labels = config.num_labels
-        self.convnext = TFConvNextModel(config)
+        self.convnext = TFConvNextModel(config, name="convnext")
 
         # Classifier head
         self.classifier = tf.keras.layers.Dense(

From ee62db49f859db2300c73472ef74db290f582a92 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:08:14 +0530
Subject: [PATCH 14/65] chore: removed unneeded comment.

---
 playground.py                                 | 38 +++++++++++++++++++
 .../models/convnext/modeling_tf_convnext.py   |  2 -
 2 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 playground.py

diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000000000..40e83b0c9d107
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,38 @@
+import tensorflow as tf
+from transformers import AutoFeatureExtractor
+
+# import your TFConvNextForImageClassification class here, we will take care
+# of adding the boilerplate to run `from transformers import
+# TFConvNextForImageClassification` later
+from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
+from transformers import ConvNextForImageClassification
+
+from PIL import Image
+
+# model = ConvNextForImageClassification.from_pretrained(
+#     "facebook/convnext-tiny-224",
+# )
+# print(f"Model State Dict:\n")
+# all_keys = list(model.state_dict().keys())
+# print([k for k in all_keys if "layer_scale" in k])
+
+model = TFConvNextForImageClassification.from_pretrained(
+    "facebook/convnext-tiny-224",
+    from_pt=True,
+)  # notice the `from_pt` argument
+print(model.summary(expand_nested=True))
+
+
+# feature_extractor = AutoFeatureExtractor.from_pretrained(
+#     "facebook/convnext-tiny-224"
+# )  # don't know if this is supposed to work with TF as well, change this as needed
+
+# image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
+# inputs = feature_extractor(images=image, return_tensors="tf")
+
+# # forward pass
+# outputs = model(**inputs)
+
+# # verify the logits
+# assert outputs.logits.shape == [1, 1000]
+# tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)
diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 8a8946575a26f..75ab13c9662e4 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -221,8 +221,6 @@ def call(self, hidden_states):
 
 class TFConvNextEncoder(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        # (sayakpaul): need to figure out the naming convention for `dwconv`,
-        # `pwconv1`, and `pwconv2`.
         super().__init__(**kwargs)
         base_name = kwargs.get("name")
         self.stages = []

From 8c1d6a3e00fd608b0c1761c9fdd9077a0afb7b9a Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:14:29 +0530
Subject: [PATCH 15/65] removed playground.py

---
 playground.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 playground.py

diff --git a/playground.py b/playground.py
deleted file mode 100644
index 40e83b0c9d107..0000000000000
--- a/playground.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import tensorflow as tf
-from transformers import AutoFeatureExtractor
-
-# import your TFConvNextForImageClassification class here, we will take care
-# of adding the boilerplate to run `from transformers import
-# TFConvNextForImageClassification` later
-from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
-from transformers import ConvNextForImageClassification
-
-from PIL import Image
-
-# model = ConvNextForImageClassification.from_pretrained(
-#     "facebook/convnext-tiny-224",
-# )
-# print(f"Model State Dict:\n")
-# all_keys = list(model.state_dict().keys())
-# print([k for k in all_keys if "layer_scale" in k])
-
-model = TFConvNextForImageClassification.from_pretrained(
-    "facebook/convnext-tiny-224",
-    from_pt=True,
-)  # notice the `from_pt` argument
-print(model.summary(expand_nested=True))
-
-
-# feature_extractor = AutoFeatureExtractor.from_pretrained(
-#     "facebook/convnext-tiny-224"
-# )  # don't know if this is supposed to work with TF as well, change this as needed
-
-# image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
-# inputs = feature_extractor(images=image, return_tensors="tf")
-
-# # forward pass
-# outputs = model(**inputs)
-
-# # verify the logits
-# assert outputs.logits.shape == [1, 1000]
-# tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From 490adf887b019ca5adbcc15d5b537898b19c54a7 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:18:41 +0530
Subject: [PATCH 16/65] rebasing

---
 playground.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 playground.py

diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000000000..8a53d5babd2be
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,38 @@
+import tensorflow as tf
+from transformers import AutoFeatureExtractor
+
+# import your TFConvNextForImageClassification class here, we will take care
+# of adding the boilerplate to run `from transformers import
+# TFConvNextForImageClassification` later
+from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
+from transformers import ConvNextForImageClassification
+
+from PIL import Image
+
+# model = ConvNextForImageClassification.from_pretrained(
+#     "facebook/convnext-tiny-224",
+# )
+# print(f"Model State Dict:\n")
+# all_keys = list(model.state_dict().keys())
+# print([k for k in all_keys if "layer_scale" in k])
+
+model = TFConvNextForImageClassification.from_pretrained(
+    "facebook/convnext-tiny-224",
+    from_pt=True,
+)  # notice the `from_pt` argument
+print(model.summary(expand_nested=True))
+
+
+feature_extractor = AutoFeatureExtractor.from_pretrained(
+    "facebook/convnext-tiny-224"
+)  # don't know if this is supposed to work with TF as well, change this as needed
+
+image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
+inputs = feature_extractor(images=image, return_tensors="tf")
+
+# forward pass
+outputs = model(**inputs)
+
+# verify the logits
+assert outputs.logits.shape == [1, 1000]
+tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From fa494693ca9e3dd8a693440c9c8c7c4ac411f686 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:19:23 +0530
Subject: [PATCH 17/65] rebasing and removing playground.py.

---
 playground.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 playground.py

diff --git a/playground.py b/playground.py
deleted file mode 100644
index 8a53d5babd2be..0000000000000
--- a/playground.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import tensorflow as tf
-from transformers import AutoFeatureExtractor
-
-# import your TFConvNextForImageClassification class here, we will take care
-# of adding the boilerplate to run `from transformers import
-# TFConvNextForImageClassification` later
-from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
-from transformers import ConvNextForImageClassification
-
-from PIL import Image
-
-# model = ConvNextForImageClassification.from_pretrained(
-#     "facebook/convnext-tiny-224",
-# )
-# print(f"Model State Dict:\n")
-# all_keys = list(model.state_dict().keys())
-# print([k for k in all_keys if "layer_scale" in k])
-
-model = TFConvNextForImageClassification.from_pretrained(
-    "facebook/convnext-tiny-224",
-    from_pt=True,
-)  # notice the `from_pt` argument
-print(model.summary(expand_nested=True))
-
-
-feature_extractor = AutoFeatureExtractor.from_pretrained(
-    "facebook/convnext-tiny-224"
-)  # don't know if this is supposed to work with TF as well, change this as needed
-
-image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
-inputs = feature_extractor(images=image, return_tensors="tf")
-
-# forward pass
-outputs = model(**inputs)
-
-# verify the logits
-assert outputs.logits.shape == [1, 1000]
-tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From acb6fa006a9596348f06da5247a101154c8bc3c5 Mon Sep 17 00:00:00 2001
From: ariG23498 <aritra.born2fly@gmail.com>
Date: Mon, 14 Feb 2022 13:41:43 +0530
Subject: [PATCH 18/65] fix: renaming TFConvNextStage conv and layer norm
 layers

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 75ab13c9662e4..dd90d472633a1 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -186,7 +186,7 @@ def __init__(
                 [
                     tf.keras.layers.LayerNormalization(
                         epsilon=1e-6,
-                        name=f"{base_name}.downsampling_layer.0",
+                        name=f"{base_name}/{base_name}.downsampling_layer.0",
                     ),
                     tf.keras.layers.Conv2D(
                         filters=out_channels,
@@ -194,12 +194,12 @@ def __init__(
                         strides=stride,
                         kernel_initializer=get_initializer(config.initializer_range),
                         bias_initializer="zeros",
-                        name=f"{base_name}.downsampling_layer.1",
+                        name=f"{base_name}/{base_name}.downsampling_layer.1",
                     ),
                 ],
             )
         else:
-            self.downsampling_layer = tf.keras.layers.Activation("linear")
+            self.downsampling_layer = tf.identity
 
         drop_path_rates = drop_path_rates or [0.0] * depth
         self.layers = tf.keras.Sequential(

From 8d56711c4ad2787eb0ce6f9f5d151a6c25f18626 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Mon, 14 Feb 2022 15:20:41 +0530
Subject: [PATCH 19/65] chore: added initializers and other minor additions.

---
 docs/source/index.mdx                         |  2 +-
 docs/source/model_doc/convnext.mdx            | 17 ++++++++-
 playground.py                                 | 38 +++++++++++++++++++
 src/transformers/__init__.py                  |  4 ++
 .../models/auto/modeling_tf_auto.py           |  2 +
 src/transformers/models/convnext/__init__.py  |  8 +++-
 src/transformers/utils/dummy_tf_objects.py    | 21 ++++++++++
 tests/test_modeling_tf_common.py              |  8 +++-
 8 files changed, 95 insertions(+), 5 deletions(-)
 create mode 100644 playground.py

diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 9ee4377110cd8..37f3efb7e2b85 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -177,7 +177,7 @@ Flax), PyTorch, and/or TensorFlow.
 |           Canine            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 |            CLIP             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 |          ConvBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-|          ConvNext           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|          ConvNext           |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 |           DeBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 |         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
diff --git a/docs/source/model_doc/convnext.mdx b/docs/source/model_doc/convnext.mdx
index e3a04d371e64c..c2323402beabf 100644
--- a/docs/source/model_doc/convnext.mdx
+++ b/docs/source/model_doc/convnext.mdx
@@ -37,7 +37,8 @@ alt="drawing" width="600"/>
 
 <small> ConvNeXT architecture. Taken from the <a href="https://arxiv.org/abs/2201.03545">original paper</a>.</small>
 
-This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt).
+This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [ariG23498](https://github.com/ariG23498)
+and [sayakpaul](https://github.com/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt).
 
 ## ConvNeXT specific outputs
 
@@ -63,4 +64,16 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi
 ## ConvNextForImageClassification
 
 [[autodoc]] ConvNextForImageClassification
-    - forward
\ No newline at end of file
+    - forward
+
+
+## TFConvNextModel
+
+[[autodoc]] TFConvNextModel
+    - call
+
+
+## TFConvNextForImageClassification
+
+[[autodoc]] TFConvNextForImageClassification
+    - call
\ No newline at end of file
diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000000000..8a53d5babd2be
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,38 @@
+import tensorflow as tf
+from transformers import AutoFeatureExtractor
+
+# import your TFConvNextForImageClassification class here, we will take care
+# of adding the boilerplate to run `from transformers import
+# TFConvNextForImageClassification` later
+from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
+from transformers import ConvNextForImageClassification
+
+from PIL import Image
+
+# model = ConvNextForImageClassification.from_pretrained(
+#     "facebook/convnext-tiny-224",
+# )
+# print(f"Model State Dict:\n")
+# all_keys = list(model.state_dict().keys())
+# print([k for k in all_keys if "layer_scale" in k])
+
+model = TFConvNextForImageClassification.from_pretrained(
+    "facebook/convnext-tiny-224",
+    from_pt=True,
+)  # notice the `from_pt` argument
+print(model.summary(expand_nested=True))
+
+
+feature_extractor = AutoFeatureExtractor.from_pretrained(
+    "facebook/convnext-tiny-224"
+)  # don't know if this is supposed to work with TF as well, change this as needed
+
+image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
+inputs = feature_extractor(images=image, return_tensors="tf")
+
+# forward pass
+outputs = model(**inputs)
+
+# verify the logits
+assert outputs.logits.shape == [1, 1000]
+tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index f4b0e2908b61d..c9e6feec10fcb 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -852,6 +852,9 @@
             "ConvNextForImageClassification",
             "ConvNextModel",
             "ConvNextPreTrainedModel",
+            "TFConvNextForImageClassification",
+            "TFConvNextModel",
+            "TFConvNextPreTrainedModel",
         ]
     )
     _import_structure["models.ctrl"].extend(
@@ -3680,6 +3683,7 @@
             TFConvBertModel,
             TFConvBertPreTrainedModel,
         )
+        from .models.convnext import TFConvNextForImageClassification, TFConvNextModel, TFConvNextPreTrainedModel
         from .models.ctrl import (
             TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
             TFCTRLForSequenceClassification,
diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py
index cd4158bc7dd46..1b95cfa01d545 100644
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@@ -36,6 +36,7 @@
         ("rembert", "TFRemBertModel"),
         ("roformer", "TFRoFormerModel"),
         ("convbert", "TFConvBertModel"),
+        ("convnext", "TFConvNextModel"),
         ("led", "TFLEDModel"),
         ("lxmert", "TFLxmertModel"),
         ("mt5", "TFMT5Model"),
@@ -155,6 +156,7 @@
     [
         # Model for Image-classsification
         ("vit", "TFViTForImageClassification"),
+        ("convnext", "TFConvNextForImageClassification"),
     ]
 )
 
diff --git a/src/transformers/models/convnext/__init__.py b/src/transformers/models/convnext/__init__.py
index cdc064d3c994a..995d38f80998d 100644
--- a/src/transformers/models/convnext/__init__.py
+++ b/src/transformers/models/convnext/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_torch_available, is_vision_available
+from ...file_utils import _LazyModule, is_torch_available, is_tf_available, is_vision_available
 
 
 _import_structure = {
@@ -36,6 +36,12 @@
         "ConvNextPreTrainedModel",
     ]
 
+if is_tf_available():
+    _import_structure["modeling_tf_convnext"] = [
+        "TFConvNextForImageClassification",
+        "TFConvNextModel",
+        "TFConvNextPreTrainedModel",
+    ]
 
 if TYPE_CHECKING:
     from .configuration_convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig
diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py
index 02b401ef394ec..d70937fe19d0c 100644
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -599,6 +599,27 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["tf"])
 
 
+class TFConvNextForImageClassification(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFConvNextModel(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFConvNextPreTrainedModel(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
 TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 95c953a6e3aec..6b442c1ffc1d9 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -475,7 +475,13 @@ def test_compile_tf_model(self):
                     "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
                 }
             # TODO: A better way to handle vision models
-            elif model_class.__name__ in ["TFViTModel", "TFViTForImageClassification", "TFCLIPVisionModel"]:
+            elif model_class.__name__ in [
+                "TFConvNextModel",
+                "TFConvNextForImageClassification",
+                "TFViTModel",
+                "TFViTForImageClassification",
+                "TFCLIPVisionModel",
+            ]:
                 inputs = tf.keras.Input(
                     batch_shape=(
                         3,

From 11b0683dc5024ede739d1a7c57b1e39b1843976f Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Mon, 14 Feb 2022 15:21:06 +0530
Subject: [PATCH 20/65] chore: added initializers and other minor additions.

---
 playground.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 playground.py

diff --git a/playground.py b/playground.py
deleted file mode 100644
index 8a53d5babd2be..0000000000000
--- a/playground.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import tensorflow as tf
-from transformers import AutoFeatureExtractor
-
-# import your TFConvNextForImageClassification class here, we will take care
-# of adding the boilerplate to run `from transformers import
-# TFConvNextForImageClassification` later
-from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
-from transformers import ConvNextForImageClassification
-
-from PIL import Image
-
-# model = ConvNextForImageClassification.from_pretrained(
-#     "facebook/convnext-tiny-224",
-# )
-# print(f"Model State Dict:\n")
-# all_keys = list(model.state_dict().keys())
-# print([k for k in all_keys if "layer_scale" in k])
-
-model = TFConvNextForImageClassification.from_pretrained(
-    "facebook/convnext-tiny-224",
-    from_pt=True,
-)  # notice the `from_pt` argument
-print(model.summary(expand_nested=True))
-
-
-feature_extractor = AutoFeatureExtractor.from_pretrained(
-    "facebook/convnext-tiny-224"
-)  # don't know if this is supposed to work with TF as well, change this as needed
-
-image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
-inputs = feature_extractor(images=image, return_tensors="tf")
-
-# forward pass
-outputs = model(**inputs)
-
-# verify the logits
-assert outputs.logits.shape == [1, 1000]
-tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From fd0ca7fa8e28f3477f206ce0b223ab2e9f00ae94 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Mon, 14 Feb 2022 18:08:04 +0530
Subject: [PATCH 21/65] add: tests for convnext.

---
 .../models/convnext/modeling_tf_convnext.py   |   3 +
 tests/test_modeling_tf_convnext.py            | 243 ++++++++++++++++++
 2 files changed, 246 insertions(+)
 create mode 100644 tests/test_modeling_tf_convnext.py

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index dd90d472633a1..2d7d24860f0fb 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -84,6 +84,9 @@ def call(self, pixel_values):
         # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
         # So change the input format from `NCHW` to `NHWC`.
         # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+        if isinstance(pixel_values, dict):
+            pixel_values = pixel_values["pixel_values"]
+
         pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
 
         embeddings = self.patch_embeddings(pixel_values)
diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py
new file mode 100644
index 0000000000000..b2bcc980ae946
--- /dev/null
+++ b/tests/test_modeling_tf_convnext.py
@@ -0,0 +1,243 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the TensorFlow ConvNext model. """
+
+import unittest
+import inspect
+
+from transformers import ConvNextConfig
+from transformers.file_utils import cached_property, is_tf_available, is_vision_available
+from transformers.testing_utils import require_tf, require_vision, slow
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    from transformers import TFConvNextForImageClassification, TFConvNextModel
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import ConvNextFeatureExtractor
+
+
+class TFConvNextModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        image_size=32,
+        num_channels=3,
+        num_stages=4,
+        hidden_sizes=[10, 20, 30, 40],
+        depths=[2, 2, 3, 2],
+        is_training=True,
+        use_labels=True,
+        intermediate_size=37,
+        hidden_act="gelu",
+        type_sequence_label_size=10,
+        initializer_range=0.02,
+        num_labels=3,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.num_channels = num_channels
+        self.num_stages = num_stages
+        self.hidden_sizes = hidden_sizes
+        self.depths = depths
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.scope = scope
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+
+        labels = None
+        if self.use_labels:
+            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+
+        config = self.get_config()
+
+        return config, pixel_values, labels
+
+    def get_config(self):
+        return ConvNextConfig(
+            num_channels=self.num_channels,
+            hidden_sizes=self.hidden_sizes,
+            depths=self.depths,
+            num_stages=self.num_stages,
+            hidden_act=self.hidden_act,
+            is_decoder=False,
+            initializer_range=self.initializer_range,
+        )
+
+    def create_and_check_model(self, config, pixel_values, labels):
+        model = TFConvNextModel(config=config)
+        result = model(pixel_values, training=False)
+        # expected last hidden states: B, H // 32, W // 32, C
+        self.parent.assertEqual(
+            result.last_hidden_state.shape,
+            (self.batch_size, self.image_size // 32, self.image_size // 32, self.hidden_sizes[-1]),
+        )
+
+    def create_and_check_for_image_classification(self, config, pixel_values, labels):
+        config.num_labels = self.type_sequence_label_size
+        model = TFConvNextForImageClassification(config)
+        result = model(pixel_values, labels=labels, training=False)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values, labels = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+@require_tf
+class TFConvNextModelTest(TFModelTesterMixin, unittest.TestCase):
+    """
+    Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds,
+    attention_mask and seq_length.
+    """
+
+    all_model_classes = (
+        (
+            TFConvNextModel,
+            TFConvNextForImageClassification,
+        )
+        if is_tf_available()
+        else ()
+    )
+
+    test_pruning = False
+    test_onnx = False
+    test_resize_embeddings = False
+    test_head_masking = False
+
+    def setUp(self):
+        self.model_tester = TFConvNextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=ConvNextConfig, has_text_modality=False, hidden_size=37)
+
+    @unittest.skip(reason="ConvNext does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="ConvNext does not support input and output embeddings")
+    def test_model_common_attributes(self):
+        pass
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.call)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    @unittest.skip(reason="Model doesn't have attention layers")
+    def test_attention_outputs(self):
+        pass
+
+    def test_hidden_states_output(self):
+        def check_hidden_states_output(inputs_dict, config, model_class):
+            model = model_class(config)
+
+            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+
+            expected_num_stages = self.model_tester.num_stages
+            self.assertEqual(len(hidden_states), expected_num_stages + 1)
+
+            # ConvNext's feature maps are of shape (batch_size, height, width, num_channels) in TF
+            self.assertListEqual(
+                list(hidden_states[0].shape[1:-1]),
+                [self.model_tester.image_size // 4, self.model_tester.image_size // 4],
+            )
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+            # check that output_hidden_states also work using config
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+    def test_for_image_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224")
+        self.assertIsNotNone(model)
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    return image
+
+
+@require_tf
+@require_vision
+class TFViTModelIntegrationTest(unittest.TestCase):
+    @cached_property
+    def default_feature_extractor(self):
+        return ConvNextFeatureExtractor.from_pretrained("acebook/convnext-tiny-224") if is_vision_available() else None
+
+    @slow
+    def test_inference_image_classification_head(self):
+        model = TFConvNextForImageClassification.from_pretrained(
+            "acebook/convnext-tiny-224",
+            from_pt=True,
+        )
+
+        feature_extractor = self.default_feature_extractor
+        image = prepare_img()
+        inputs = feature_extractor(images=image, return_tensors="tf")
+
+        # forward pass
+        outputs = model(**inputs)
+
+        # verify the logits
+        expected_shape = tf.TensorShape((1, 1000))
+        self.assertEqual(outputs.logits.shape, expected_shape)
+
+        expected_slice = tf.constant([-0.0260, -0.4739, 0.1911])
+
+        tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4)

From 98911a249fa29a40f44d7cc25392e0f25c7a4c36 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Mon, 14 Feb 2022 20:42:05 +0530
Subject: [PATCH 22/65] fix: integration tester class.

---
 tests/test_modeling_tf_convnext.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py
index b2bcc980ae946..d70d9d35d8d57 100644
--- a/tests/test_modeling_tf_convnext.py
+++ b/tests/test_modeling_tf_convnext.py
@@ -215,7 +215,7 @@ def prepare_img():
 
 @require_tf
 @require_vision
-class TFViTModelIntegrationTest(unittest.TestCase):
+class TFConvNextModelIntegrationTest(unittest.TestCase):
     @cached_property
     def default_feature_extractor(self):
         return ConvNextFeatureExtractor.from_pretrained("acebook/convnext-tiny-224") if is_vision_available() else None

From b30a8ccb4ccb75ed914c2af982d102a023bda4de Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 16 Feb 2022 07:09:00 +0530
Subject: [PATCH 23/65] fix: issues mentioned in pr feedback (round 1).

---
 src/transformers/modeling_tf_utils.py                  |  3 ---
 .../models/convnext/modeling_tf_convnext.py            |  1 -
 tests/test_modeling_tf_common.py                       | 10 ++--------
 tests/test_modeling_tf_convnext.py                     |  6 ++++--
 4 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 1d62180f29f5e..f5249a8e76b07 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -309,9 +309,6 @@ def booleans_processing(config, **kwargs):
     final_booleans = {}
 
     if tf.executing_eagerly():
-        # final_booleans["output_attentions"] = (
-        #     kwargs["output_attentions"] if kwargs["output_attentions"] else config.output_attentions
-        # )
         final_booleans["output_attentions"] = kwargs.get("output_attentions", None)
         if not final_booleans["output_attentions"]:
             final_booleans["output_attentions"] = config.output_attentions
diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 2d7d24860f0fb..d355a5663709f 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -181,7 +181,6 @@ class TFConvNextStage(tf.keras.layers.Layer):
     def __init__(
         self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs
     ):
-        # (sayakpaul): need to figure out the names.
         super().__init__(**kwargs)
         base_name = kwargs.get("name")
         if in_channels != out_channels or stride > 1:
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 6b442c1ffc1d9..6fe6ea3b52e16 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -474,14 +474,8 @@ def test_compile_tf_model(self):
                     ),
                     "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
                 }
-            # TODO: A better way to handle vision models
-            elif model_class.__name__ in [
-                "TFConvNextModel",
-                "TFConvNextForImageClassification",
-                "TFViTModel",
-                "TFViTForImageClassification",
-                "TFCLIPVisionModel",
-            ]:
+            # `pixel_values` implies that the input is an image
+            elif model_class.main_input_name == "pixel_values":  
                 inputs = tf.keras.Input(
                     batch_shape=(
                         3,
diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py
index d70d9d35d8d57..c47a895453b87 100644
--- a/tests/test_modeling_tf_convnext.py
+++ b/tests/test_modeling_tf_convnext.py
@@ -218,12 +218,14 @@ def prepare_img():
 class TFConvNextModelIntegrationTest(unittest.TestCase):
     @cached_property
     def default_feature_extractor(self):
-        return ConvNextFeatureExtractor.from_pretrained("acebook/convnext-tiny-224") if is_vision_available() else None
+        return (
+            ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
+        )
 
     @slow
     def test_inference_image_classification_head(self):
         model = TFConvNextForImageClassification.from_pretrained(
-            "acebook/convnext-tiny-224",
+            "facebook/convnext-tiny-224",
             from_pt=True,
         )
 

From 2181d5b2fddd23ec214c6601a2beabbbcdd75dc8 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 16 Feb 2022 07:58:49 +0530
Subject: [PATCH 24/65] fix: how output_hidden_states arg is propagated inside
 the network.

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 3 +++
 tests/test_modeling_tf_convnext.py                       | 1 -
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index d355a5663709f..7b899723c1589 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -400,6 +400,9 @@ def call(
         >>> outputs = model(**inputs)
         >>> last_hidden_states = outputs.last_hidden_state
         ```"""
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         inputs = input_processing(
diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py
index c47a895453b87..2bb05b1aed953 100644
--- a/tests/test_modeling_tf_convnext.py
+++ b/tests/test_modeling_tf_convnext.py
@@ -173,7 +173,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
 
             outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
             hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
 
             expected_num_stages = self.model_tester.num_stages

From cc98979f3177394c3f0e4b6c41093fcbce07bec5 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 16 Feb 2022 08:31:18 +0530
Subject: [PATCH 25/65] feat: handling of output_attentions arg for pure cnn models.

---
 tests/test_modeling_tf_common.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 6fe6ea3b52e16..822656a3afbff 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -475,7 +475,7 @@ def test_compile_tf_model(self):
                     "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
                 }
             # `pixel_values` implies that the input is an image
-            elif model_class.main_input_name == "pixel_values":  
+            elif model_class.main_input_name == "pixel_values":
                 inputs = tf.keras.Input(
                     batch_shape=(
                         3,
@@ -799,23 +799,27 @@ def recursive_check(tuple_object, dict_object):
             dict_inputs = self._prepare_for_class(inputs_dict, model_class)
             check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
 
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
+            # Pure conv models (such as ConvNeXt) don't have `output_attentions`.
+            if config.output_attentions:
+                tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+                dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
 
             tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
 
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
+            if config.output_attentions:
+                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
 
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(
-                model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
-            )
+            if config.output_attentions:
+                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+                check_equivalence(
+                    model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
+                )
 
     def test_inputs_embeds(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

From 12e4505d5d4d259eba51fccc9bd0c5ad69b2755d Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 16 Feb 2022 08:34:05 +0530
Subject: [PATCH 26/65] chore: added a note on equal contribution in model
 docs.

---
 docs/source/model_doc/convnext.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/model_doc/convnext.mdx b/docs/source/model_doc/convnext.mdx
index c2323402beabf..f2e789b36916a 100644
--- a/docs/source/model_doc/convnext.mdx
+++ b/docs/source/model_doc/convnext.mdx
@@ -38,7 +38,7 @@ alt="drawing" width="600"/>
 <small> ConvNeXT architecture. Taken from the <a href="https://arxiv.org/abs/2201.03545">original paper</a>.</small>
 
 This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [ariG23498](https://github.com/ariG23498)
-and [sayakpaul](https://github.com/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt).
+and [sayakpaul](https://github.com/sayakpaul) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt).
 
 ## ConvNeXT specific outputs
 

From eb493386372ce42939e71c29083574049e3aa7a9 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:18:41 +0530
Subject: [PATCH 27/65] rebasing

---
 playground.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 playground.py

diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000000000..8a53d5babd2be
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,38 @@
+import tensorflow as tf
+from transformers import AutoFeatureExtractor
+
+# import your TFConvNextForImageClassification class here, we will take care
+# of adding the boilerplate to run `from transformers import
+# TFConvNextForImageClassification` later
+from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
+from transformers import ConvNextForImageClassification
+
+from PIL import Image
+
+# model = ConvNextForImageClassification.from_pretrained(
+#     "facebook/convnext-tiny-224",
+# )
+# print(f"Model State Dict:\n")
+# all_keys = list(model.state_dict().keys())
+# print([k for k in all_keys if "layer_scale" in k])
+
+model = TFConvNextForImageClassification.from_pretrained(
+    "facebook/convnext-tiny-224",
+    from_pt=True,
+)  # notice the `from_pt` argument
+print(model.summary(expand_nested=True))
+
+
+feature_extractor = AutoFeatureExtractor.from_pretrained(
+    "facebook/convnext-tiny-224"
+)  # don't know if this is supposed to work with TF as well, change this as needed
+
+image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
+inputs = feature_extractor(images=image, return_tensors="tf")
+
+# forward pass
+outputs = model(**inputs)
+
+# verify the logits
+assert outputs.logits.shape == [1, 1000]
+tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From 5e01b71b1543b066491ecc809124ff34cfde9e8a Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:19:23 +0530
Subject: [PATCH 28/65] rebasing and removing playground.py.

---
 playground.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 playground.py

diff --git a/playground.py b/playground.py
deleted file mode 100644
index 8a53d5babd2be..0000000000000
--- a/playground.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import tensorflow as tf
-from transformers import AutoFeatureExtractor
-
-# import your TFConvNextForImageClassification class here, we will take care
-# of adding the boilerplate to run `from transformers import
-# TFConvNextForImageClassification` later
-from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
-from transformers import ConvNextForImageClassification
-
-from PIL import Image
-
-# model = ConvNextForImageClassification.from_pretrained(
-#     "facebook/convnext-tiny-224",
-# )
-# print(f"Model State Dict:\n")
-# all_keys = list(model.state_dict().keys())
-# print([k for k in all_keys if "layer_scale" in k])
-
-model = TFConvNextForImageClassification.from_pretrained(
-    "facebook/convnext-tiny-224",
-    from_pt=True,
-)  # notice the `from_pt` argument
-print(model.summary(expand_nested=True))
-
-
-feature_extractor = AutoFeatureExtractor.from_pretrained(
-    "facebook/convnext-tiny-224"
-)  # don't know if this is supposed to work with TF as well, change this as needed
-
-image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
-inputs = feature_extractor(images=image, return_tensors="tf")
-
-# forward pass
-outputs = model(**inputs)
-
-# verify the logits
-assert outputs.logits.shape == [1, 1000]
-tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From 908d0cf85b6c40c9de7c0dd6c77c65287472c7f2 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 17 Feb 2022 09:49:25 +0530
Subject: [PATCH 29/65] feat: encapsulation for the convnext trunk.

---
 .../models/convnext/modeling_tf_convnext.py   | 176 +++++++++++++-----
 1 file changed, 131 insertions(+), 45 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 7b899723c1589..1a37948647a70 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -29,6 +29,7 @@
     TFSequenceClassificationLoss,
     get_initializer,
     input_processing,
+    keras_serializable,
 )
 from ...utils import logging
 from .configuration_convnext import ConvNextConfig
@@ -81,12 +82,12 @@ def __init__(self, config, **kwargs):
         self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
 
     def call(self, pixel_values):
-        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
-        # So change the input format from `NCHW` to `NHWC`.
-        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
         if isinstance(pixel_values, dict):
             pixel_values = pixel_values["pixel_values"]
 
+        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
+        # So change the input format from `NCHW` to `NHWC`.
+        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
         pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
 
         embeddings = self.patch_embeddings(pixel_values)
@@ -119,27 +120,35 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs):
             groups=dim,
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
-            name=f"{base_name}.dwconv",
+            name="dwconv",
+            # name=f"{base_name}.dwconv",
         )  # depthwise conv
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name=f"{base_name}.layernorm")
+        self.layernorm = tf.keras.layers.LayerNormalization(
+            epsilon=1e-6,
+            name="layernorm",
+            # name=f"{base_name}.layernorm"
+        )
         self.pwconv1 = tf.keras.layers.Dense(
             units=4 * dim,
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
-            name=f"{base_name}.pwconv1",
+            name="pwconv1",
+            # name=f"{base_name}.pwconv1",
         )  # pointwise/1x1 convs, implemented with linear layers
         self.act = get_tf_activation(config.hidden_act)
         self.pwconv2 = tf.keras.layers.Dense(
             units=dim,
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
-            name=f"{base_name}.pwconv2",
+            name="pwconv2",
+            # name=f"{base_name}.pwconv2",
         )
         self.layer_scale_parameter = (
             tf.Variable(
                 config.layer_scale_init_value * tf.ones((dim,)),
                 trainable=True,
-                name=f"{base_name}.layer_scale_parameter",
+                name="layer_scale_parameter",
+                # name=f"{base_name}.layer_scale_parameter",
             )
             if config.layer_scale_init_value > 0
             else None
@@ -147,9 +156,17 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs):
         # Using `layers.Activation` instead of `tf.identity` to better control `training`
         # behaviour.
         self.drop_path = (
-            TFConvNextDropPath(drop_path, name=f"{base_name}.drop_path")
+            TFConvNextDropPath(
+                drop_path,
+                name="drop_path",
+                # name=f"{base_name}.drop_path"
+            )
             if drop_path > 0.0
-            else tf.keras.layers.Activation("linear", name=f"{base_name}.drop_path")
+            else tf.keras.layers.Activation(
+                "linear",
+                name="drop_path",
+                # name=f"{base_name}.drop_path"
+            )
         )
 
     def call(self, hidden_states, training=False):
@@ -188,7 +205,8 @@ def __init__(
                 [
                     tf.keras.layers.LayerNormalization(
                         epsilon=1e-6,
-                        name=f"{base_name}/{base_name}.downsampling_layer.0",
+                        name="downsampling_layer.0",
+                        # name=f"{base_name}/{base_name}.downsampling_layer.0",
                     ),
                     tf.keras.layers.Conv2D(
                         filters=out_channels,
@@ -196,7 +214,8 @@ def __init__(
                         strides=stride,
                         kernel_initializer=get_initializer(config.initializer_range),
                         bias_initializer="zeros",
-                        name=f"{base_name}/{base_name}.downsampling_layer.1",
+                        name="downsampling_layer.1",
+                        # name=f"{base_name}/{base_name}.downsampling_layer.1",
                     ),
                 ],
             )
@@ -208,7 +227,11 @@ def __init__(
             [
                 *[
                     TFConvNextLayer(
-                        config, dim=out_channels, drop_path=drop_path_rates[j], name=f"{base_name}.layers.{j}"
+                        config,
+                        dim=out_channels,
+                        drop_path=drop_path_rates[j],
+                        name=f"layers.{j}",
+                        # name=f"{base_name}.layers.{j}"
                     )
                     for j in range(depth)
                 ]
@@ -238,7 +261,8 @@ def __init__(self, config, **kwargs):
                 stride=2 if i > 0 else 1,
                 depth=config.depths[i],
                 drop_path_rates=drop_path_rates[cur],
-                name=f"{base_name}.stages.{i}",
+                name=f"stages.{i}",
+                # name=f"{base_name}.stages.{i}",
             )
             self.stages.append(stage)
             cur += config.depths[i]
@@ -265,6 +289,73 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True):
         )
 
 
+@keras_serializable
+class TFConvNextMainLayer(tf.keras.layers.Layer):
+    config_class = ConvNextConfig
+
+    def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        base_name = kwargs.get("name")
+        self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
+        self.encoder = TFConvNextEncoder(config, name="encoder")
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+        self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None
+
+    def call(
+        self,
+        pixel_values: Optional[TFModelInputType] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if "input_ids" in inputs:
+            inputs["pixel_values"] = inputs.pop("input_ids")
+
+        if inputs["pixel_values"] is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(inputs["pixel_values"], training=inputs["training"])
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=inputs["training"],
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        # print(f"From modeling TF: {type(self.pooler), type(self.layernorm)}")
+        # print(f"From modeling TF: {last_hidden_state}")
+        pooled_output = self.layernorm(self.pooler(last_hidden_state))
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
 class TFConvNextPreTrainedModel(TFPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -354,21 +445,21 @@ def serving(self, inputs):
     CONVNEXT_START_DOCSTRING,
 )
 class TFConvNextModel(TFConvNextPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
+    def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs):
         super().__init__(config, *inputs, **kwargs)
-        base_name = kwargs.get("name")
-        self.config = config
+        # base_name = kwargs.get("name")
+        # self.config = config
+        self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext")
+        # # Observe the name parameter in `encoder`, `embeddings`, and `layernorm`
+        # # Adding `base_name` to the embeddings and layernorm adds errors.
+        # self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
+        # self.encoder = TFConvNextEncoder(config, name=f"{base_name}.encoder")
 
-        # Observe the name parameter in `encoder`, `embeddings`, and `layernorm`
-        # Adding `base_name` to the embeddings and layernorm adds errors.
-        self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
-        self.encoder = TFConvNextEncoder(config, name=f"{base_name}.encoder")
-
-        # final layernorm layer
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+        # # final layernorm layer
+        # self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
 
-        # global average pooling
-        self.pooler = tf.keras.layers.GlobalAvgPool2D()
+        # # global average pooling
+        # self.pooler = tf.keras.layers.GlobalAvgPool2D()
 
     @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
@@ -421,26 +512,13 @@ def call(
         if inputs["pixel_values"] is None:
             raise ValueError("You have to specify pixel_values")
 
-        embedding_output = self.embeddings(pixel_values)
-
-        encoder_outputs = self.encoder(
-            embedding_output,
+        outputs = self.convnext(
+            pixel_values=inputs["pixel_values"],
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
+            training=inputs["training"],
         )
-
-        last_hidden_state = encoder_outputs[0]
-
-        pooled_output = self.layernorm(self.pooler(last_hidden_state))
-
-        if not return_dict:
-            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
-
-        return TFBaseModelOutputWithPooling(
-            last_hidden_state=last_hidden_state,
-            pooler_output=pooled_output,
-            hidden_states=encoder_outputs.hidden_states,
-        )
+        return outputs
 
 
 @add_start_docstrings(
@@ -455,7 +533,7 @@ def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
 
         self.num_labels = config.num_labels
-        self.convnext = TFConvNextModel(config, name="convnext")
+        self.convnext = TFConvNextMainLayer(config, name="convnext")
 
         # Classifier head
         self.classifier = tf.keras.layers.Dense(
@@ -505,6 +583,11 @@ def call(
         >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
         >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
         ```"""
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         inputs = input_processing(
             func=self.call,
             config=self.config,
@@ -523,7 +606,10 @@ def call(
             raise ValueError("You have to specify pixel_values")
 
         outputs = self.convnext(
-            inputs["pixel_values"], output_hidden_states=output_hidden_states, return_dict=return_dict
+            inputs["pixel_values"],
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=inputs["training"],
         )
 
         pooled_output = outputs.pooler_output if return_dict else outputs[1]

From d386cf884237fe7a215901b4e748cc1daf8d18b2 Mon Sep 17 00:00:00 2001
From: Joao Gante <joao@huggingface.co>
Date: Fri, 18 Feb 2022 18:13:55 +0000
Subject: [PATCH 30/65] Fix variable naming; Test-related corrections; Run make
 fixup

---
 src/transformers/__init__.py                  |  10 +-
 src/transformers/modeling_tf_utils.py         |   2 +-
 src/transformers/models/convnext/__init__.py  |   5 +-
 .../models/convnext/modeling_tf_convnext.py   | 131 ++++++++----------
 tests/test_modeling_tf_convnext.py            |   6 +-
 5 files changed, 71 insertions(+), 83 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 22eceb14efabd..3224c282cdf85 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -852,9 +852,6 @@
             "ConvNextForImageClassification",
             "ConvNextModel",
             "ConvNextPreTrainedModel",
-            "TFConvNextForImageClassification",
-            "TFConvNextModel",
-            "TFConvNextPreTrainedModel",
         ]
     )
     _import_structure["models.ctrl"].extend(
@@ -1717,6 +1714,13 @@
             "TFConvBertPreTrainedModel",
         ]
     )
+    _import_structure["models.convnext"].extend(
+        [
+            "TFConvNextForImageClassification",
+            "TFConvNextModel",
+            "TFConvNextPreTrainedModel",
+        ]
+    )
     _import_structure["models.ctrl"].extend(
         [
             "TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST",
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 85b1accb605df..8ed0a273a518c 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -1829,7 +1829,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
+        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
 
     def build(self, input_shape):
         """
diff --git a/src/transformers/models/convnext/__init__.py b/src/transformers/models/convnext/__init__.py
index 995d38f80998d..a627c462e9ba4 100644
--- a/src/transformers/models/convnext/__init__.py
+++ b/src/transformers/models/convnext/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_torch_available, is_tf_available, is_vision_available
+from ...file_utils import _LazyModule, is_tf_available, is_torch_available, is_vision_available
 
 
 _import_structure = {
@@ -57,6 +57,9 @@
             ConvNextPreTrainedModel,
         )
 
+    if is_tf_available():
+        from .modeling_tf_convnext import TFConvNextForImageClassification, TFConvNextModel, TFConvNextPreTrainedModel
+
 
 else:
     import sys
diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 1a37948647a70..2dd5f155da94a 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -69,8 +69,6 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
 
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
-        # note that we do not use the `base_name` here in `patch_embeddings`
-        # and `layernorm`
         self.patch_embeddings = tf.keras.layers.Conv2D(
             filters=config.hidden_sizes[0],
             kernel_size=config.patch_size,
@@ -101,8 +99,8 @@ class TFConvNextLayer(tf.keras.layers.Layer):
     There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
     H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back
 
-    The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to
-    follow NHWC ordering, we can just apply the operations straight-away without the permutation.
+    The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to follow
+    NHWC ordering, we can just apply the operations straight-away without the permutation.
 
     Args:
         config ([`ConvNextConfig`]): Model configuration class.
@@ -112,7 +110,8 @@ class TFConvNextLayer(tf.keras.layers.Layer):
 
     def __init__(self, config, dim, drop_path=0.0, **kwargs):
         super().__init__(**kwargs)
-        base_name = kwargs.get("name")
+        self.dim = dim
+        self.config = config
         self.dwconv = tf.keras.layers.Conv2D(
             filters=dim,
             kernel_size=7,
@@ -121,19 +120,16 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs):
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
             name="dwconv",
-            # name=f"{base_name}.dwconv",
         )  # depthwise conv
         self.layernorm = tf.keras.layers.LayerNormalization(
             epsilon=1e-6,
             name="layernorm",
-            # name=f"{base_name}.layernorm"
         )
         self.pwconv1 = tf.keras.layers.Dense(
             units=4 * dim,
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
             name="pwconv1",
-            # name=f"{base_name}.pwconv1",
         )  # pointwise/1x1 convs, implemented with linear layers
         self.act = get_tf_activation(config.hidden_act)
         self.pwconv2 = tf.keras.layers.Dense(
@@ -141,17 +137,6 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs):
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
             name="pwconv2",
-            # name=f"{base_name}.pwconv2",
-        )
-        self.layer_scale_parameter = (
-            tf.Variable(
-                config.layer_scale_init_value * tf.ones((dim,)),
-                trainable=True,
-                name="layer_scale_parameter",
-                # name=f"{base_name}.layer_scale_parameter",
-            )
-            if config.layer_scale_init_value > 0
-            else None
         )
         # Using `layers.Activation` instead of `tf.identity` to better control `training`
         # behaviour.
@@ -159,16 +144,28 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs):
             TFConvNextDropPath(
                 drop_path,
                 name="drop_path",
-                # name=f"{base_name}.drop_path"
             )
             if drop_path > 0.0
             else tf.keras.layers.Activation(
                 "linear",
                 name="drop_path",
-                # name=f"{base_name}.drop_path"
             )
         )
 
+    def build(self, input_shape: tf.TensorShape):
+        # PT's `nn.Parameters` must be mapped to a TF layer weight to inherit the same name hierarchy (and vice-versa)
+        self.layer_scale_parameter = (
+            self.add_weight(
+                shape=(self.dim,),
+                initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
+                trainable=True,
+                name="layer_scale_parameter",
+            )
+            if self.config.layer_scale_init_value > 0
+            else None
+        )
+        super().build(input_shape)
+
     def call(self, hidden_states, training=False):
         input = hidden_states
         x = self.dwconv(hidden_states)
@@ -199,55 +196,46 @@ def __init__(
         self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs
     ):
         super().__init__(**kwargs)
-        base_name = kwargs.get("name")
         if in_channels != out_channels or stride > 1:
-            self.downsampling_layer = tf.keras.Sequential(
-                [
-                    tf.keras.layers.LayerNormalization(
-                        epsilon=1e-6,
-                        name="downsampling_layer.0",
-                        # name=f"{base_name}/{base_name}.downsampling_layer.0",
-                    ),
-                    tf.keras.layers.Conv2D(
-                        filters=out_channels,
-                        kernel_size=kernel_size,
-                        strides=stride,
-                        kernel_initializer=get_initializer(config.initializer_range),
-                        bias_initializer="zeros",
-                        name="downsampling_layer.1",
-                        # name=f"{base_name}/{base_name}.downsampling_layer.1",
-                    ),
-                ],
-            )
+            self.downsampling_layer = [
+                tf.keras.layers.LayerNormalization(
+                    epsilon=1e-6,
+                    name="downsampling_layer.0",
+                ),
+                tf.keras.layers.Conv2D(
+                    filters=out_channels,
+                    kernel_size=kernel_size,
+                    strides=stride,
+                    kernel_initializer=get_initializer(config.initializer_range),
+                    bias_initializer="zeros",
+                    name="downsampling_layer.1",
+                ),
+            ]
         else:
-            self.downsampling_layer = tf.identity
+            self.downsampling_layer = [tf.identity]
 
         drop_path_rates = drop_path_rates or [0.0] * depth
-        self.layers = tf.keras.Sequential(
-            [
-                *[
-                    TFConvNextLayer(
-                        config,
-                        dim=out_channels,
-                        drop_path=drop_path_rates[j],
-                        name=f"layers.{j}",
-                        # name=f"{base_name}.layers.{j}"
-                    )
-                    for j in range(depth)
-                ]
-            ],
-        )
+        self.layers = [
+            TFConvNextLayer(
+                config,
+                dim=out_channels,
+                drop_path=drop_path_rates[j],
+                name=f"layers.{j}",
+            )
+            for j in range(depth)
+        ]
 
     def call(self, hidden_states):
-        hidden_states = self.downsampling_layer(hidden_states)
-        hidden_states = self.layers(hidden_states)
+        for layer in self.downsampling_layer:
+            hidden_states = layer(hidden_states)
+        for layer in self.layers:
+            hidden_states = layer(hidden_states)
         return hidden_states
 
 
 class TFConvNextEncoder(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
-        base_name = kwargs.get("name")
         self.stages = []
         drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
         cur = 0
@@ -262,7 +250,6 @@ def __init__(self, config, **kwargs):
                 depth=config.depths[i],
                 drop_path_rates=drop_path_rates[cur],
                 name=f"stages.{i}",
-                # name=f"{base_name}.stages.{i}",
             )
             self.stages.append(stage)
             cur += config.depths[i]
@@ -297,7 +284,6 @@ def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwa
         super().__init__(**kwargs)
 
         self.config = config
-        base_name = kwargs.get("name")
         self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
         self.encoder = TFConvNextEncoder(config, name="encoder")
         self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
@@ -342,8 +328,6 @@ def call(
         )
 
         last_hidden_state = encoder_outputs[0]
-        # print(f"From modeling TF: {type(self.pooler), type(self.layernorm)}")
-        # print(f"From modeling TF: {last_hidden_state}")
         pooled_output = self.layernorm(self.pooler(last_hidden_state))
 
         if not return_dict:
@@ -447,19 +431,7 @@ def serving(self, inputs):
 class TFConvNextModel(TFConvNextPreTrainedModel):
     def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs):
         super().__init__(config, *inputs, **kwargs)
-        # base_name = kwargs.get("name")
-        # self.config = config
         self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext")
-        # # Observe the name parameter in `encoder`, `embeddings`, and `layernorm`
-        # # Adding `base_name` to the embeddings and layernorm adds errors.
-        # self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
-        # self.encoder = TFConvNextEncoder(config, name=f"{base_name}.encoder")
-
-        # # final layernorm layer
-        # self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
-
-        # # global average pooling
-        # self.pooler = tf.keras.layers.GlobalAvgPool2D()
 
     @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
@@ -518,7 +490,16 @@ def call(
             return_dict=return_dict,
             training=inputs["training"],
         )
-        return outputs
+
+        # converts back NHWC -> NCHW, to match PT's output
+        if not return_dict:
+            return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)),
+            pooler_output=outputs.pooler_output,
+            hidden_states=outputs.hidden_states,
+        )
 
 
 @add_start_docstrings(
diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py
index 2bb05b1aed953..38665d0625031 100644
--- a/tests/test_modeling_tf_convnext.py
+++ b/tests/test_modeling_tf_convnext.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 """ Testing suite for the TensorFlow ConvNext model. """
 
-import unittest
 import inspect
+import unittest
 
 from transformers import ConvNextConfig
 from transformers.file_utils import cached_property, is_tf_available, is_vision_available
@@ -96,10 +96,10 @@ def get_config(self):
     def create_and_check_model(self, config, pixel_values, labels):
         model = TFConvNextModel(config=config)
         result = model(pixel_values, training=False)
-        # expected last hidden states: B, H // 32, W // 32, C
+        # expected last hidden states: B, C, H // 32, W // 32
         self.parent.assertEqual(
             result.last_hidden_state.shape,
-            (self.batch_size, self.image_size // 32, self.image_size // 32, self.hidden_sizes[-1]),
+            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
         )
 
     def create_and_check_for_image_classification(self, config, pixel_values, labels):

From 15c916f416dfc5c820528f25c0257544847ceb50 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Mon, 21 Feb 2022 16:05:38 +0530
Subject: [PATCH 31/65] chore: added Joao as a contributor to convnext.

---
 docs/source/model_doc/convnext.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/model_doc/convnext.mdx b/docs/source/model_doc/convnext.mdx
index f2e789b36916a..4d46248565f94 100644
--- a/docs/source/model_doc/convnext.mdx
+++ b/docs/source/model_doc/convnext.mdx
@@ -37,8 +37,8 @@ alt="drawing" width="600"/>
 
 <small> ConvNeXT architecture. Taken from the <a href="https://arxiv.org/abs/2201.03545">original paper</a>.</small>
 
-This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [ariG23498](https://github.com/ariG23498)
-and [sayakpaul](https://github.com/sayakpaul) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt).
+This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [ariG23498](https://github.com/ariG23498),
+[gante](https://github.com/gante), and [sayakpaul](https://github.com/sayakpaul) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt).
 
 ## ConvNeXT specific outputs
 

From 05b8273708d5f33ff712a1325591e98bdcd285cc Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:18:41 +0530
Subject: [PATCH 32/65] rebasing

---
 playground.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 playground.py

diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000000000..8a53d5babd2be
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,38 @@
+import tensorflow as tf
+from transformers import AutoFeatureExtractor
+
+# import your TFConvNextForImageClassification class here, we will take care
+# of adding the boilerplate to run `from transformers import
+# TFConvNextForImageClassification` later
+from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
+from transformers import ConvNextForImageClassification
+
+from PIL import Image
+
+# model = ConvNextForImageClassification.from_pretrained(
+#     "facebook/convnext-tiny-224",
+# )
+# print(f"Model State Dict:\n")
+# all_keys = list(model.state_dict().keys())
+# print([k for k in all_keys if "layer_scale" in k])
+
+model = TFConvNextForImageClassification.from_pretrained(
+    "facebook/convnext-tiny-224",
+    from_pt=True,
+)  # notice the `from_pt` argument
+print(model.summary(expand_nested=True))
+
+
+feature_extractor = AutoFeatureExtractor.from_pretrained(
+    "facebook/convnext-tiny-224"
+)  # don't know if this is supposed to work with TF as well, change this as needed
+
+image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
+inputs = feature_extractor(images=image, return_tensors="tf")
+
+# forward pass
+outputs = model(**inputs)
+
+# verify the logits
+assert outputs.logits.shape == [1, 1000]
+tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From d247441d0a5d8e03d3959aed4e38869fbc3fe1f6 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:19:23 +0530
Subject: [PATCH 33/65] rebasing and removing playground.py.

---
 playground.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 playground.py

diff --git a/playground.py b/playground.py
deleted file mode 100644
index 8a53d5babd2be..0000000000000
--- a/playground.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import tensorflow as tf
-from transformers import AutoFeatureExtractor
-
-# import your TFConvNextForImageClassification class here, we will take care
-# of adding the boilerplate to run `from transformers import
-# TFConvNextForImageClassification` later
-from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
-from transformers import ConvNextForImageClassification
-
-from PIL import Image
-
-# model = ConvNextForImageClassification.from_pretrained(
-#     "facebook/convnext-tiny-224",
-# )
-# print(f"Model State Dict:\n")
-# all_keys = list(model.state_dict().keys())
-# print([k for k in all_keys if "layer_scale" in k])
-
-model = TFConvNextForImageClassification.from_pretrained(
-    "facebook/convnext-tiny-224",
-    from_pt=True,
-)  # notice the `from_pt` argument
-print(model.summary(expand_nested=True))
-
-
-feature_extractor = AutoFeatureExtractor.from_pretrained(
-    "facebook/convnext-tiny-224"
-)  # don't know if this is supposed to work with TF as well, change this as needed
-
-image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
-inputs = feature_extractor(images=image, return_tensors="tf")
-
-# forward pass
-outputs = model(**inputs)
-
-# verify the logits
-assert outputs.logits.shape == [1, 1000]
-tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From bb8e6c208363e837900a525e6921077e1c22ee76 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:18:41 +0530
Subject: [PATCH 34/65] rebasing

---
 playground.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 playground.py

diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000000000..8a53d5babd2be
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,38 @@
+import tensorflow as tf
+from transformers import AutoFeatureExtractor
+
+# import your TFConvNextForImageClassification class here, we will take care
+# of adding the boilerplate to run `from transformers import
+# TFConvNextForImageClassification` later
+from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
+from transformers import ConvNextForImageClassification
+
+from PIL import Image
+
+# model = ConvNextForImageClassification.from_pretrained(
+#     "facebook/convnext-tiny-224",
+# )
+# print(f"Model State Dict:\n")
+# all_keys = list(model.state_dict().keys())
+# print([k for k in all_keys if "layer_scale" in k])
+
+model = TFConvNextForImageClassification.from_pretrained(
+    "facebook/convnext-tiny-224",
+    from_pt=True,
+)  # notice the `from_pt` argument
+print(model.summary(expand_nested=True))
+
+
+feature_extractor = AutoFeatureExtractor.from_pretrained(
+    "facebook/convnext-tiny-224"
+)  # don't know if this is supposed to work with TF as well, change this as needed
+
+image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
+inputs = feature_extractor(images=image, return_tensors="tf")
+
+# forward pass
+outputs = model(**inputs)
+
+# verify the logits
+assert outputs.logits.shape == [1, 1000]
+tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From 3b5366d731aaf9048168534894b3d0b83a739ca6 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:19:23 +0530
Subject: [PATCH 35/65] rebasing and removing playground.py.

---
 playground.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 playground.py

diff --git a/playground.py b/playground.py
deleted file mode 100644
index 8a53d5babd2be..0000000000000
--- a/playground.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import tensorflow as tf
-from transformers import AutoFeatureExtractor
-
-# import your TFConvNextForImageClassification class here, we will take care
-# of adding the boilerplate to run `from transformers import
-# TFConvNextForImageClassification` later
-from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
-from transformers import ConvNextForImageClassification
-
-from PIL import Image
-
-# model = ConvNextForImageClassification.from_pretrained(
-#     "facebook/convnext-tiny-224",
-# )
-# print(f"Model State Dict:\n")
-# all_keys = list(model.state_dict().keys())
-# print([k for k in all_keys if "layer_scale" in k])
-
-model = TFConvNextForImageClassification.from_pretrained(
-    "facebook/convnext-tiny-224",
-    from_pt=True,
-)  # notice the `from_pt` argument
-print(model.summary(expand_nested=True))
-
-
-feature_extractor = AutoFeatureExtractor.from_pretrained(
-    "facebook/convnext-tiny-224"
-)  # don't know if this is supposed to work with TF as well, change this as needed
-
-image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
-inputs = feature_extractor(images=image, return_tensors="tf")
-
-# forward pass
-outputs = model(**inputs)
-
-# verify the logits
-assert outputs.logits.shape == [1, 1000]
-tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From 49b35cdade8d9349fbd3c8761a2b7658df1b7217 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Mon, 21 Feb 2022 20:18:45 +0530
Subject: [PATCH 36/65] chore: corrected copyright year and added comment on
 NHWC.

---
 .../models/convnext/modeling_tf_convnext.py   | 132 ++++++++++++++----
 tests/test_modeling_tf_convnext.py            |  84 ++++++++---
 2 files changed, 174 insertions(+), 42 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 2dd5f155da94a..c003d465e0aeb 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -21,8 +21,16 @@
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
-from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput
+from ...file_utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    replace_return_docstrings,
+)
+from ...modeling_tf_outputs import (
+    TFBaseModelOutput,
+    TFBaseModelOutputWithPooling,
+    TFSequenceClassifierOutput,
+)
 from ...modeling_tf_utils import (
     TFModelInputType,
     TFPreTrainedModel,
@@ -77,7 +85,9 @@ def __init__(self, config, **kwargs):
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
         )
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
+        self.layernorm = tf.keras.layers.LayerNormalization(
+            epsilon=1e-6, name="layernorm"
+        )
 
     def call(self, pixel_values):
         if isinstance(pixel_values, dict):
@@ -157,7 +167,9 @@ def build(self, input_shape: tf.TensorShape):
         self.layer_scale_parameter = (
             self.add_weight(
                 shape=(self.dim,),
-                initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
+                initializer=tf.keras.initializers.Constant(
+                    value=self.config.layer_scale_init_value
+                ),
                 trainable=True,
                 name="layer_scale_parameter",
             )
@@ -193,7 +205,15 @@ class TFConvNextStage(tf.keras.layers.Layer):
     """
 
     def __init__(
-        self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs
+        self,
+        config,
+        in_channels,
+        out_channels,
+        kernel_size=2,
+        stride=2,
+        depth=2,
+        drop_path_rates=None,
+        **kwargs
     ):
         super().__init__(**kwargs)
         if in_channels != out_channels or stride > 1:
@@ -202,11 +222,18 @@ def __init__(
                     epsilon=1e-6,
                     name="downsampling_layer.0",
                 ),
+                # Inputs to this layer will follow NHWC format since we
+                # transposed the inputs from NCHW to NHWC in the `TFConvNextEmbeddings`
+                # layer. All the outputs throughout the model will be in NHWC
+                # from this point on until the output where we again change to
+                # NCHW.
                 tf.keras.layers.Conv2D(
                     filters=out_channels,
                     kernel_size=kernel_size,
                     strides=stride,
-                    kernel_initializer=get_initializer(config.initializer_range),
+                    kernel_initializer=get_initializer(
+                        config.initializer_range
+                    ),
                     bias_initializer="zeros",
                     name="downsampling_layer.1",
                 ),
@@ -237,7 +264,10 @@ class TFConvNextEncoder(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
         self.stages = []
-        drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
+        drop_path_rates = [
+            x
+            for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))
+        ]
         cur = 0
         prev_chs = config.hidden_sizes[0]
         for i in range(config.num_stages):
@@ -268,7 +298,9 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True):
             all_hidden_states = all_hidden_states + (hidden_states,)
 
         if not return_dict:
-            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
+            return tuple(
+                v for v in [hidden_states, all_hidden_states] if v is not None
+            )
 
         return TFBaseModelOutput(
             last_hidden_state=hidden_states,
@@ -280,14 +312,20 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True):
 class TFConvNextMainLayer(tf.keras.layers.Layer):
     config_class = ConvNextConfig
 
-    def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs):
+    def __init__(
+        self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs
+    ):
         super().__init__(**kwargs)
 
         self.config = config
         self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
         self.encoder = TFConvNextEncoder(config, name="encoder")
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
-        self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None
+        self.layernorm = tf.keras.layers.LayerNormalization(
+            epsilon=config.layer_norm_eps, name="layernorm"
+        )
+        self.pooler = (
+            tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None
+        )
 
     def call(
         self,
@@ -298,9 +336,15 @@ def call(
         **kwargs,
     ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
         output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict
+            if return_dict is not None
+            else self.config.use_return_dict
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         inputs = input_processing(
             func=self.call,
@@ -318,7 +362,9 @@ def call(
         if inputs["pixel_values"] is None:
             raise ValueError("You have to specify pixel_values")
 
-        embedding_output = self.embeddings(inputs["pixel_values"], training=inputs["training"])
+        embedding_output = self.embeddings(
+            inputs["pixel_values"], training=inputs["training"]
+        )
 
         encoder_outputs = self.encoder(
             embedding_output,
@@ -359,14 +405,22 @@ def dummy_inputs(self) -> Dict[str, tf.Tensor]:
             `Dict[str, tf.Tensor]`: The dummy inputs.
         """
         VISION_DUMMY_INPUTS = tf.random.uniform(
-            shape=(3, self.config.num_channels, self.config.image_size, self.config.image_size), dtype=tf.float32
+            shape=(
+                3,
+                self.config.num_channels,
+                self.config.image_size,
+                self.config.image_size,
+            ),
+            dtype=tf.float32,
         )
         return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)}
 
     @tf.function(
         input_signature=[
             {
-                "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
+                "pixel_values": tf.TensorSpec(
+                    (None, None, None, None), tf.float32, name="pixel_values"
+                ),
             }
         ]
     )
@@ -431,10 +485,14 @@ def serving(self, inputs):
 class TFConvNextModel(TFConvNextPreTrainedModel):
     def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs):
         super().__init__(config, *inputs, **kwargs)
-        self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext")
+        self.convnext = TFConvNextMainLayer(
+            config, add_pooling_layer=add_pooling_layer, name="convnext"
+        )
 
     @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(
+        output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC
+    )
     def call(
         self,
         pixel_values: Optional[TFModelInputType] = None,
@@ -464,9 +522,15 @@ def call(
         >>> last_hidden_states = outputs.last_hidden_state
         ```"""
         output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict
+            if return_dict is not None
+            else self.config.use_return_dict
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         inputs = input_processing(
             func=self.call,
@@ -496,7 +560,9 @@ def call(
             return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:]
 
         return TFBaseModelOutputWithPooling(
-            last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)),
+            last_hidden_state=tf.transpose(
+                outputs.last_hidden_state, perm=(0, 3, 1, 2)
+            ),
             pooler_output=outputs.pooler_output,
             hidden_states=outputs.hidden_states,
         )
@@ -509,7 +575,9 @@ def call(
     """,
     CONVNEXT_START_DOCSTRING,
 )
-class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss):
+class TFConvNextForImageClassification(
+    TFConvNextPreTrainedModel, TFSequenceClassificationLoss
+):
     def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
 
@@ -525,7 +593,9 @@ def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
         )
 
     @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(
+        output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC
+    )
     def call(
         self,
         pixel_values: Optional[TFModelInputType] = None,
@@ -565,9 +635,15 @@ def call(
         >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
         ```"""
         output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict
+            if return_dict is not None
+            else self.config.use_return_dict
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         inputs = input_processing(
             func=self.call,
@@ -596,7 +672,11 @@ def call(
         pooled_output = outputs.pooler_output if return_dict else outputs[1]
 
         logits = self.classifier(pooled_output)
-        loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
+        loss = (
+            None
+            if inputs["labels"] is None
+            else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
+        )
 
         if not inputs["return_dict"]:
             output = (logits,) + outputs[2:]
diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py
index 38665d0625031..ed597f92657ed 100644
--- a/tests/test_modeling_tf_convnext.py
+++ b/tests/test_modeling_tf_convnext.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,11 +18,19 @@
 import unittest
 
 from transformers import ConvNextConfig
-from transformers.file_utils import cached_property, is_tf_available, is_vision_available
+from transformers.file_utils import (
+    cached_property,
+    is_tf_available,
+    is_vision_available,
+)
 from transformers.testing_utils import require_tf, require_vision, slow
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
+from .test_modeling_tf_common import (
+    TFModelTesterMixin,
+    floats_tensor,
+    ids_tensor,
+)
 
 
 if is_tf_available():
@@ -72,11 +80,20 @@ def __init__(
         self.scope = scope
 
     def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+        pixel_values = floats_tensor(
+            [
+                self.batch_size,
+                self.num_channels,
+                self.image_size,
+                self.image_size,
+            ]
+        )
 
         labels = None
         if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            labels = ids_tensor(
+                [self.batch_size], self.type_sequence_label_size
+            )
 
         config = self.get_config()
 
@@ -99,14 +116,24 @@ def create_and_check_model(self, config, pixel_values, labels):
         # expected last hidden states: B, C, H // 32, W // 32
         self.parent.assertEqual(
             result.last_hidden_state.shape,
-            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
+            (
+                self.batch_size,
+                self.hidden_sizes[-1],
+                self.image_size // 32,
+                self.image_size // 32,
+            ),
         )
 
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
+    def create_and_check_for_image_classification(
+        self, config, pixel_values, labels
+    ):
         config.num_labels = self.type_sequence_label_size
         model = TFConvNextForImageClassification(config)
         result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
+        self.parent.assertEqual(
+            result.logits.shape,
+            (self.batch_size, self.type_sequence_label_size),
+        )
 
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
@@ -138,13 +165,20 @@ class TFConvNextModelTest(TFModelTesterMixin, unittest.TestCase):
 
     def setUp(self):
         self.model_tester = TFConvNextModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ConvNextConfig, has_text_modality=False, hidden_size=37)
+        self.config_tester = ConfigTester(
+            self,
+            config_class=ConvNextConfig,
+            has_text_modality=False,
+            hidden_size=37,
+        )
 
     @unittest.skip(reason="ConvNext does not use inputs_embeds")
     def test_inputs_embeds(self):
         pass
 
-    @unittest.skip(reason="ConvNext does not support input and output embeddings")
+    @unittest.skip(
+        reason="ConvNext does not support input and output embeddings"
+    )
     def test_model_common_attributes(self):
         pass
 
@@ -173,7 +207,11 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
 
             outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+            hidden_states = (
+                outputs.encoder_hidden_states
+                if config.is_encoder_decoder
+                else outputs.hidden_states
+            )
 
             expected_num_stages = self.model_tester.num_stages
             self.assertEqual(len(hidden_states), expected_num_stages + 1)
@@ -181,10 +219,16 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             # ConvNext's feature maps are of shape (batch_size, height, width, num_channels) in TF
             self.assertListEqual(
                 list(hidden_states[0].shape[1:-1]),
-                [self.model_tester.image_size // 4, self.model_tester.image_size // 4],
+                [
+                    self.model_tester.image_size // 4,
+                    self.model_tester.image_size // 4,
+                ],
             )
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             inputs_dict["output_hidden_states"] = True
@@ -198,7 +242,9 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
     def test_for_image_classification(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
+        self.model_tester.create_and_check_for_image_classification(
+            *config_and_inputs
+        )
 
     @slow
     def test_model_from_pretrained(self):
@@ -218,7 +264,11 @@ class TFConvNextModelIntegrationTest(unittest.TestCase):
     @cached_property
     def default_feature_extractor(self):
         return (
-            ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
+            ConvNextFeatureExtractor.from_pretrained(
+                "facebook/convnext-tiny-224"
+            )
+            if is_vision_available()
+            else None
         )
 
     @slow
@@ -241,4 +291,6 @@ def test_inference_image_classification_head(self):
 
         expected_slice = tf.constant([-0.0260, -0.4739, 0.1911])
 
-        tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4)
+        tf.debugging.assert_near(
+            outputs.logits[0, :3], expected_slice, atol=1e-4
+        )

From d9b507935a98db016c77e3b094d55ee20f6b70bd Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Mon, 21 Feb 2022 20:59:48 +0530
Subject: [PATCH 37/65] chore: fixed the black version and ran formatting.

---
 .../models/convnext/modeling_tf_convnext.py   | 107 ++++--------------
 tests/test_modeling_tf_convnext.py            |  32 ++----
 2 files changed, 30 insertions(+), 109 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index c003d465e0aeb..950891db17730 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -85,9 +85,7 @@ def __init__(self, config, **kwargs):
             kernel_initializer=get_initializer(config.initializer_range),
             bias_initializer="zeros",
         )
-        self.layernorm = tf.keras.layers.LayerNormalization(
-            epsilon=1e-6, name="layernorm"
-        )
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
 
     def call(self, pixel_values):
         if isinstance(pixel_values, dict):
@@ -167,9 +165,7 @@ def build(self, input_shape: tf.TensorShape):
         self.layer_scale_parameter = (
             self.add_weight(
                 shape=(self.dim,),
-                initializer=tf.keras.initializers.Constant(
-                    value=self.config.layer_scale_init_value
-                ),
+                initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
                 trainable=True,
                 name="layer_scale_parameter",
             )
@@ -205,15 +201,7 @@ class TFConvNextStage(tf.keras.layers.Layer):
     """
 
     def __init__(
-        self,
-        config,
-        in_channels,
-        out_channels,
-        kernel_size=2,
-        stride=2,
-        depth=2,
-        drop_path_rates=None,
-        **kwargs
+        self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs
     ):
         super().__init__(**kwargs)
         if in_channels != out_channels or stride > 1:
@@ -231,9 +219,7 @@ def __init__(
                     filters=out_channels,
                     kernel_size=kernel_size,
                     strides=stride,
-                    kernel_initializer=get_initializer(
-                        config.initializer_range
-                    ),
+                    kernel_initializer=get_initializer(config.initializer_range),
                     bias_initializer="zeros",
                     name="downsampling_layer.1",
                 ),
@@ -264,10 +250,7 @@ class TFConvNextEncoder(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
         self.stages = []
-        drop_path_rates = [
-            x
-            for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))
-        ]
+        drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
         cur = 0
         prev_chs = config.hidden_sizes[0]
         for i in range(config.num_stages):
@@ -298,9 +281,7 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True):
             all_hidden_states = all_hidden_states + (hidden_states,)
 
         if not return_dict:
-            return tuple(
-                v for v in [hidden_states, all_hidden_states] if v is not None
-            )
+            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
 
         return TFBaseModelOutput(
             last_hidden_state=hidden_states,
@@ -312,20 +293,14 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True):
 class TFConvNextMainLayer(tf.keras.layers.Layer):
     config_class = ConvNextConfig
 
-    def __init__(
-        self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs
-    ):
+    def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs):
         super().__init__(**kwargs)
 
         self.config = config
         self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
         self.encoder = TFConvNextEncoder(config, name="encoder")
-        self.layernorm = tf.keras.layers.LayerNormalization(
-            epsilon=config.layer_norm_eps, name="layernorm"
-        )
-        self.pooler = (
-            tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None
-        )
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+        self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None
 
     def call(
         self,
@@ -336,15 +311,9 @@ def call(
         **kwargs,
     ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
         output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
-        return_dict = (
-            return_dict
-            if return_dict is not None
-            else self.config.use_return_dict
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         inputs = input_processing(
             func=self.call,
@@ -362,9 +331,7 @@ def call(
         if inputs["pixel_values"] is None:
             raise ValueError("You have to specify pixel_values")
 
-        embedding_output = self.embeddings(
-            inputs["pixel_values"], training=inputs["training"]
-        )
+        embedding_output = self.embeddings(inputs["pixel_values"], training=inputs["training"])
 
         encoder_outputs = self.encoder(
             embedding_output,
@@ -418,9 +385,7 @@ def dummy_inputs(self) -> Dict[str, tf.Tensor]:
     @tf.function(
         input_signature=[
             {
-                "pixel_values": tf.TensorSpec(
-                    (None, None, None, None), tf.float32, name="pixel_values"
-                ),
+                "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
             }
         ]
     )
@@ -485,14 +450,10 @@ def serving(self, inputs):
 class TFConvNextModel(TFConvNextPreTrainedModel):
     def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs):
         super().__init__(config, *inputs, **kwargs)
-        self.convnext = TFConvNextMainLayer(
-            config, add_pooling_layer=add_pooling_layer, name="convnext"
-        )
+        self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext")
 
     @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
-    @replace_return_docstrings(
-        output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC
-    )
+    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
     def call(
         self,
         pixel_values: Optional[TFModelInputType] = None,
@@ -522,15 +483,9 @@ def call(
         >>> last_hidden_states = outputs.last_hidden_state
         ```"""
         output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
-        return_dict = (
-            return_dict
-            if return_dict is not None
-            else self.config.use_return_dict
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         inputs = input_processing(
             func=self.call,
@@ -560,9 +515,7 @@ def call(
             return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:]
 
         return TFBaseModelOutputWithPooling(
-            last_hidden_state=tf.transpose(
-                outputs.last_hidden_state, perm=(0, 3, 1, 2)
-            ),
+            last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)),
             pooler_output=outputs.pooler_output,
             hidden_states=outputs.hidden_states,
         )
@@ -575,9 +528,7 @@ def call(
     """,
     CONVNEXT_START_DOCSTRING,
 )
-class TFConvNextForImageClassification(
-    TFConvNextPreTrainedModel, TFSequenceClassificationLoss
-):
+class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss):
     def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
 
@@ -593,9 +544,7 @@ def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
         )
 
     @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
-    @replace_return_docstrings(
-        output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC
-    )
+    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
     def call(
         self,
         pixel_values: Optional[TFModelInputType] = None,
@@ -635,15 +584,9 @@ def call(
         >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
         ```"""
         output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
-        return_dict = (
-            return_dict
-            if return_dict is not None
-            else self.config.use_return_dict
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         inputs = input_processing(
             func=self.call,
@@ -672,11 +615,7 @@ def call(
         pooled_output = outputs.pooler_output if return_dict else outputs[1]
 
         logits = self.classifier(pooled_output)
-        loss = (
-            None
-            if inputs["labels"] is None
-            else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
-        )
+        loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
 
         if not inputs["return_dict"]:
             output = (logits,) + outputs[2:]
diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py
index ed597f92657ed..df7c6278d8038 100644
--- a/tests/test_modeling_tf_convnext.py
+++ b/tests/test_modeling_tf_convnext.py
@@ -91,9 +91,7 @@ def prepare_config_and_inputs(self):
 
         labels = None
         if self.use_labels:
-            labels = ids_tensor(
-                [self.batch_size], self.type_sequence_label_size
-            )
+            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
 
         config = self.get_config()
 
@@ -124,9 +122,7 @@ def create_and_check_model(self, config, pixel_values, labels):
             ),
         )
 
-    def create_and_check_for_image_classification(
-        self, config, pixel_values, labels
-    ):
+    def create_and_check_for_image_classification(self, config, pixel_values, labels):
         config.num_labels = self.type_sequence_label_size
         model = TFConvNextForImageClassification(config)
         result = model(pixel_values, labels=labels, training=False)
@@ -176,9 +172,7 @@ def setUp(self):
     def test_inputs_embeds(self):
         pass
 
-    @unittest.skip(
-        reason="ConvNext does not support input and output embeddings"
-    )
+    @unittest.skip(reason="ConvNext does not support input and output embeddings")
     def test_model_common_attributes(self):
         pass
 
@@ -207,11 +201,7 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
 
             outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            hidden_states = (
-                outputs.encoder_hidden_states
-                if config.is_encoder_decoder
-                else outputs.hidden_states
-            )
+            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
 
             expected_num_stages = self.model_tester.num_stages
             self.assertEqual(len(hidden_states), expected_num_stages + 1)
@@ -242,9 +232,7 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
     def test_for_image_classification(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(
-            *config_and_inputs
-        )
+        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
 
     @slow
     def test_model_from_pretrained(self):
@@ -264,11 +252,7 @@ class TFConvNextModelIntegrationTest(unittest.TestCase):
     @cached_property
     def default_feature_extractor(self):
         return (
-            ConvNextFeatureExtractor.from_pretrained(
-                "facebook/convnext-tiny-224"
-            )
-            if is_vision_available()
-            else None
+            ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
         )
 
     @slow
@@ -291,6 +275,4 @@ def test_inference_image_classification_head(self):
 
         expected_slice = tf.constant([-0.0260, -0.4739, 0.1911])
 
-        tf.debugging.assert_near(
-            outputs.logits[0, :3], expected_slice, atol=1e-4
-        )
+        tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4)

From 4b4737f53693f476352a8848de7683138c3ff8aa Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Mon, 21 Feb 2022 22:46:29 +0530
Subject: [PATCH 38/65] chore: ran make style.

---
 .../models/convnext/modeling_tf_convnext.py          | 12 ++----------
 tests/test_modeling_tf_convnext.py                   | 12 ++----------
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 950891db17730..328194dddbc2c 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -21,16 +21,8 @@
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
-from ...modeling_tf_outputs import (
-    TFBaseModelOutput,
-    TFBaseModelOutputWithPooling,
-    TFSequenceClassifierOutput,
-)
+from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
+from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput
 from ...modeling_tf_utils import (
     TFModelInputType,
     TFPreTrainedModel,
diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py
index df7c6278d8038..52cbf02f5a549 100644
--- a/tests/test_modeling_tf_convnext.py
+++ b/tests/test_modeling_tf_convnext.py
@@ -18,19 +18,11 @@
 import unittest
 
 from transformers import ConvNextConfig
-from transformers.file_utils import (
-    cached_property,
-    is_tf_available,
-    is_vision_available,
-)
+from transformers.file_utils import cached_property, is_tf_available, is_vision_available
 from transformers.testing_utils import require_tf, require_vision, slow
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import (
-    TFModelTesterMixin,
-    floats_tensor,
-    ids_tensor,
-)
+from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
 
 
 if is_tf_available():

From 2322a5f4f660c82fd3a1afe8fd6fdb7e44ab5c2f Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 22 Feb 2022 10:27:25 +0530
Subject: [PATCH 39/65] chore: removed from_pt argument from test, ran make
 style.

---
 tests/test_modeling_tf_convnext.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py
index 52cbf02f5a549..233ec6662b820 100644
--- a/tests/test_modeling_tf_convnext.py
+++ b/tests/test_modeling_tf_convnext.py
@@ -249,10 +249,7 @@ def default_feature_extractor(self):
 
     @slow
     def test_inference_image_classification_head(self):
-        model = TFConvNextForImageClassification.from_pretrained(
-            "facebook/convnext-tiny-224",
-            from_pt=True,
-        )
+        model = TFConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224")
 
         feature_extractor = self.default_feature_extractor
         image = prepare_img()

From 61ae121e451d52e72d66adaf511d65af835c4c8e Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:18:41 +0530
Subject: [PATCH 40/65] rebasing

---
 playground.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 playground.py

diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000000000..8a53d5babd2be
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,38 @@
+import tensorflow as tf
+from transformers import AutoFeatureExtractor
+
+# import your TFConvNextForImageClassification class here, we will take care
+# of adding the boilerplate to run `from transformers import
+# TFConvNextForImageClassification` later
+from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
+from transformers import ConvNextForImageClassification
+
+from PIL import Image
+
+# model = ConvNextForImageClassification.from_pretrained(
+#     "facebook/convnext-tiny-224",
+# )
+# print(f"Model State Dict:\n")
+# all_keys = list(model.state_dict().keys())
+# print([k for k in all_keys if "layer_scale" in k])
+
+model = TFConvNextForImageClassification.from_pretrained(
+    "facebook/convnext-tiny-224",
+    from_pt=True,
+)  # notice the `from_pt` argument
+print(model.summary(expand_nested=True))
+
+
+feature_extractor = AutoFeatureExtractor.from_pretrained(
+    "facebook/convnext-tiny-224"
+)  # don't know if this is supposed to work with TF as well, change this as needed
+
+image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
+inputs = feature_extractor(images=image, return_tensors="tf")
+
+# forward pass
+outputs = model(**inputs)
+
+# verify the logits
+assert outputs.logits.shape == [1, 1000]
+tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From b5683772e3ec8ac03cd23e50ffa17ba18f983b58 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:19:23 +0530
Subject: [PATCH 41/65] rebasing and removing playground.py.

---
 playground.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 playground.py

diff --git a/playground.py b/playground.py
deleted file mode 100644
index 8a53d5babd2be..0000000000000
--- a/playground.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import tensorflow as tf
-from transformers import AutoFeatureExtractor
-
-# import your TFConvNextForImageClassification class here, we will take care
-# of adding the boilerplate to run `from transformers import
-# TFConvNextForImageClassification` later
-from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
-from transformers import ConvNextForImageClassification
-
-from PIL import Image
-
-# model = ConvNextForImageClassification.from_pretrained(
-#     "facebook/convnext-tiny-224",
-# )
-# print(f"Model State Dict:\n")
-# all_keys = list(model.state_dict().keys())
-# print([k for k in all_keys if "layer_scale" in k])
-
-model = TFConvNextForImageClassification.from_pretrained(
-    "facebook/convnext-tiny-224",
-    from_pt=True,
-)  # notice the `from_pt` argument
-print(model.summary(expand_nested=True))
-
-
-feature_extractor = AutoFeatureExtractor.from_pretrained(
-    "facebook/convnext-tiny-224"
-)  # don't know if this is supposed to work with TF as well, change this as needed
-
-image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
-inputs = feature_extractor(images=image, return_tensors="tf")
-
-# forward pass
-outputs = model(**inputs)
-
-# verify the logits
-assert outputs.logits.shape == [1, 1000]
-tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From 1259bf8b37e08ab0f05f46991c6e0571a761027f Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:18:41 +0530
Subject: [PATCH 42/65] rebasing

---
 playground.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 playground.py

diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000000000..8a53d5babd2be
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,38 @@
+import tensorflow as tf
+from transformers import AutoFeatureExtractor
+
+# import your TFConvNextForImageClassification class here, we will take care
+# of adding the boilerplate to run `from transformers import
+# TFConvNextForImageClassification` later
+from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
+from transformers import ConvNextForImageClassification
+
+from PIL import Image
+
+# model = ConvNextForImageClassification.from_pretrained(
+#     "facebook/convnext-tiny-224",
+# )
+# print(f"Model State Dict:\n")
+# all_keys = list(model.state_dict().keys())
+# print([k for k in all_keys if "layer_scale" in k])
+
+model = TFConvNextForImageClassification.from_pretrained(
+    "facebook/convnext-tiny-224",
+    from_pt=True,
+)  # notice the `from_pt` argument
+print(model.summary(expand_nested=True))
+
+
+feature_extractor = AutoFeatureExtractor.from_pretrained(
+    "facebook/convnext-tiny-224"
+)  # don't know if this is supposed to work with TF as well, change this as needed
+
+image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
+inputs = feature_extractor(images=image, return_tensors="tf")
+
+# forward pass
+outputs = model(**inputs)
+
+# verify the logits
+assert outputs.logits.shape == [1, 1000]
+tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From 96c1ea4e9a7249bf0a1e8cbf6d682e1c82e9cd24 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:19:23 +0530
Subject: [PATCH 43/65] rebasing and removing playground.py.

---
 playground.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 playground.py

diff --git a/playground.py b/playground.py
deleted file mode 100644
index 8a53d5babd2be..0000000000000
--- a/playground.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import tensorflow as tf
-from transformers import AutoFeatureExtractor
-
-# import your TFConvNextForImageClassification class here, we will take care
-# of adding the boilerplate to run `from transformers import
-# TFConvNextForImageClassification` later
-from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
-from transformers import ConvNextForImageClassification
-
-from PIL import Image
-
-# model = ConvNextForImageClassification.from_pretrained(
-#     "facebook/convnext-tiny-224",
-# )
-# print(f"Model State Dict:\n")
-# all_keys = list(model.state_dict().keys())
-# print([k for k in all_keys if "layer_scale" in k])
-
-model = TFConvNextForImageClassification.from_pretrained(
-    "facebook/convnext-tiny-224",
-    from_pt=True,
-)  # notice the `from_pt` argument
-print(model.summary(expand_nested=True))
-
-
-feature_extractor = AutoFeatureExtractor.from_pretrained(
-    "facebook/convnext-tiny-224"
-)  # don't know if this is supposed to work with TF as well, change this as needed
-
-image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
-inputs = feature_extractor(images=image, return_tensors="tf")
-
-# forward pass
-outputs = model(**inputs)
-
-# verify the logits
-assert outputs.logits.shape == [1, 1000]
-tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From b1972164638903e57f0a8343d04924753e1ecf6c Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 24 Feb 2022 12:22:50 +0530
Subject: [PATCH 44/65] fix: tests in the convnext subclass, ran make style.

---
 tests/test_modeling_tf_common.py   | 419 +++++++++++++++++++++++------
 tests/test_modeling_tf_convnext.py |  45 ++++
 2 files changed, 378 insertions(+), 86 deletions(-)

diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index e072b4febd90b..2038f29e56cf8 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -83,7 +83,8 @@
             # Restrict TensorFlow to only allocate x GB of memory on the GPUs
             try:
                 tf.config.set_logical_device_configuration(
-                    gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)]
+                    gpu,
+                    [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)],
                 )
                 logical_gpus = tf.config.list_logical_devices("GPU")
                 print("Logical GPUs", logical_gpus)
@@ -116,7 +117,10 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d
 
         if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
             inputs_dict = {
-                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
+                k: tf.tile(
+                    tf.expand_dims(v, 1),
+                    (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1),
+                )
                 if isinstance(v, tf.Tensor) and v.ndim > 0
                 else v
                 for k, v in inputs_dict.items()
@@ -144,7 +148,11 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d
                 *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
             ]:
                 inputs_dict["labels"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
+                    (
+                        self.model_tester.batch_size,
+                        self.model_tester.seq_length,
+                    ),
+                    dtype=tf.int32,
                 )
         return inputs_dict
 
@@ -152,7 +160,10 @@ def test_initialization(self):
         pass
 
     def test_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -166,7 +177,10 @@ def test_save_load(self):
                 self.assert_outputs_same(after_outputs, outputs)
 
     def test_save_load_config(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -218,7 +232,10 @@ def test_onnx_compliancy(self):
         if not self.test_onnx:
             return
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         INTERNAL_OPS = [
             "Assert",
             "AssignVariableOp",
@@ -265,7 +282,10 @@ def test_onnx_runtime_optimize(self):
         import onnxruntime
         import tf2onnx
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -276,7 +296,10 @@ def test_onnx_runtime_optimize(self):
             onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
 
     def test_keras_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         tf_main_layer_classes = set(
             module_member
@@ -321,7 +344,8 @@ def test_keras_save_load(self):
                     )
                 else:
                     model = tf.keras.models.load_model(
-                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
+                        filepath,
+                        custom_objects={main_layer_class.__name__: main_layer_class},
                     )
                 assert isinstance(model, tf.keras.Model)
                 after_outputs = model(inputs_dict)
@@ -348,7 +372,10 @@ def test_pt_tf_model_equivalence(self):
 
         import transformers
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
@@ -361,7 +388,9 @@ def test_pt_tf_model_equivalence(self):
 
             # Check we can load pt model in tf and vice-versa with model => model functions
             tf_model = transformers.load_pytorch_model_in_tf2_model(
-                tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
+                tf_model,
+                pt_model,
+                tf_inputs=self._prepare_for_class(inputs_dict, model_class),
             )
             pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
 
@@ -382,7 +411,10 @@ def test_pt_tf_model_equivalence(self):
 
             with torch.no_grad():
                 pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)
+            tfo = tf_model(
+                self._prepare_for_class(inputs_dict, model_class),
+                training=False,
+            )
 
             tf_hidden_states = tfo[0].numpy()
             pt_hidden_states = pto[0].numpy()
@@ -441,14 +473,20 @@ def test_pt_tf_model_equivalence(self):
             self.assertLessEqual(max_diff, 4e-2)
 
     def test_compile_tf_model(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         max_input = getattr(self.model_tester, "max_position_embeddings", 512)
         optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
         loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
         metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
 
         for model_class in self.all_model_classes:
-            if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]:
+            if model_class.__name__ in [
+                "TFSpeech2TextModel",
+                "TFSpeech2TextForConditionalGeneration",
+            ]:
                 inputs = {
                     "decoder_input_ids": tf.keras.Input(
                         batch_shape=(2, max_input),
@@ -472,7 +510,11 @@ def test_compile_tf_model(self):
                         name="decoder_input_ids",
                         dtype="int32",
                     ),
-                    "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
+                    "input_ids": tf.keras.Input(
+                        batch_shape=(2, max_input),
+                        name="input_ids",
+                        dtype="int32",
+                    ),
                 }
             # `pixel_values` implies that the input is an image
             elif model_class.main_input_name == "pixel_values":
@@ -488,7 +530,11 @@ def test_compile_tf_model(self):
                 )
             elif model_class.__name__ in ["TFCLIPModel"]:
                 inputs = {
-                    "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"),
+                    "input_ids": tf.keras.Input(
+                        batch_shape=(3, max_input),
+                        name="input_ids",
+                        dtype="int32",
+                    ),
                     "pixel_values": tf.keras.Input(
                         batch_shape=(
                             3,
@@ -501,7 +547,11 @@ def test_compile_tf_model(self):
                     ),
                 }
             elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
+                inputs = tf.keras.Input(
+                    batch_shape=(4, 2, max_input),
+                    name="input_ids",
+                    dtype="int32",
+                )
             else:
                 inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
 
@@ -524,7 +574,10 @@ def test_compile_tf_model(self):
             extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 
     def test_keyword_and_dict_args(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -540,10 +593,21 @@ def test_keyword_and_dict_args(self):
             self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
 
     def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length)
+        decoder_seq_length = getattr(
+            self.model_tester,
+            "decoder_seq_length",
+            self.model_tester.seq_length,
+        )
+        encoder_seq_length = getattr(
+            self.model_tester,
+            "encoder_seq_length",
+            self.model_tester.seq_length,
+        )
         decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
         encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
 
@@ -554,7 +618,11 @@ def check_decoder_attentions_output(outputs):
             self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
+                [
+                    self.model_tester.num_attention_heads,
+                    decoder_seq_length,
+                    decoder_key_length,
+                ],
             )
 
         def check_encoder_attentions_output(outputs):
@@ -564,7 +632,11 @@ def check_encoder_attentions_output(outputs):
             self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+                [
+                    self.model_tester.num_attention_heads,
+                    encoder_seq_length,
+                    encoder_key_length,
+                ],
             )
 
         for model_class in self.all_model_classes:
@@ -606,7 +678,10 @@ def test_headmasking(self):
             return
 
         random.Random().seed(42)
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         random.Random().seed()
 
         inputs_dict["output_attentions"] = True
@@ -619,11 +694,19 @@ def test_headmasking(self):
             def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
                 if i == 0:
                     return tf.concat(
-                        (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0
+                        (
+                            tf.zeros(1, dtype=tf.float32),
+                            tf.ones(attention_heads - 1, dtype=tf.float32),
+                        ),
+                        0,
                     )
                 elif i == num_hidden_layers - 1:
                     return tf.concat(
-                        (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0
+                        (
+                            tf.zeros(attention_heads - 1, dtype=tf.float32),
+                            tf.ones(1, dtype=tf.float32),
+                        ),
+                        0,
                     )
                 else:
                     return tf.ones(attention_heads, dtype=tf.float32)
@@ -652,7 +735,8 @@ def check_attentions_validity(attentions):
                 # Remove Nan
                 for t in attentions:
                     self.assertLess(
-                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy()
+                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(),
+                        (tf.size(t) / 4).numpy(),
                     )  # Check we don't have more than 25% nans (arbitrary)
 
                 attentions = [
@@ -660,11 +744,23 @@ def check_attentions_validity(attentions):
                 ]  # remove them (the test is less complete)
 
                 self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
-                self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0)
+                self.assertNotEqual(
+                    tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(),
+                    0.0,
+                )
                 if len(attentions) > 2:  # encoder-decodere models have only 2 layers in each modules
-                    self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0)
-                self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0)
-                self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0)
+                    self.assertNotEqual(
+                        tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(),
+                        0.0,
+                    )
+                self.assertAlmostEqual(
+                    tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(),
+                    0.0,
+                )
+                self.assertNotEqual(
+                    tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(),
+                    0.0,
+                )
 
             if model.config.is_encoder_decoder:
                 check_attentions_validity(outputs.encoder_attentions)
@@ -675,13 +771,18 @@ def check_attentions_validity(attentions):
                 check_attentions_validity(outputs.attentions)
 
     def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_hidden_states_output(config, inputs_dict, model_class):
             model = model_class(config)
             outputs = model(self._prepare_for_class(inputs_dict, model_class))
             expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+                self.model_tester,
+                "expected_num_hidden_layers",
+                self.model_tester.num_hidden_layers + 1,
             )
 
             if model.config.is_encoder_decoder:
@@ -692,12 +793,18 @@ def check_hidden_states_output(config, inputs_dict, model_class):
                 self.assertEqual(len(encoder_hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(encoder_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
                 )
                 self.assertEqual(len(decoder_hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(decoder_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
                 )
             else:
                 hidden_states = outputs.hidden_states
@@ -705,7 +812,10 @@ def check_hidden_states_output(config, inputs_dict, model_class):
                 self.assertEqual(len(hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
                 )
 
         for model_class in self.all_model_classes:
@@ -717,7 +827,10 @@ def check_hidden_states_output(config, inputs_dict, model_class):
             check_hidden_states_output(config, inputs_dict, model_class)
 
     def test_model_common_attributes(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         text_in_text_out_models = (
             get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
             + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
@@ -747,13 +860,22 @@ def test_model_common_attributes(self):
                 assert name is None
 
     def test_determinism(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
             first, second = (
-                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
-                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
+                model(
+                    self._prepare_for_class(inputs_dict, model_class),
+                    training=False,
+                )[0],
+                model(
+                    self._prepare_for_class(inputs_dict, model_class),
+                    training=False,
+                )[0],
             )
             out_1 = first.numpy()
             out_2 = second.numpy()
@@ -764,7 +886,10 @@ def test_determinism(self):
 
     def test_model_outputs_equivalence(self):
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
             tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
@@ -799,30 +924,32 @@ def recursive_check(tuple_object, dict_object):
             dict_inputs = self._prepare_for_class(inputs_dict, model_class)
             check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
 
-            # Pure conv models (such as ConvNeXt) don't have `output_attentions`.
-            if config.output_attentions:
-                tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-                dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
 
             tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
 
-            if config.output_attentions:
-                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
 
-            if config.output_attentions:
-                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                check_equivalence(
-                    model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
-                )
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(
+                model,
+                tuple_inputs,
+                dict_inputs,
+                {"output_hidden_states": True, "output_attentions": True},
+            )
 
     def test_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -849,7 +976,10 @@ def test_inputs_embeds(self):
             model(inputs)
 
     def test_numpy_arrays_inputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def prepare_numpy_arrays(inputs_dict):
             inputs_np_dict = {}
@@ -874,7 +1004,10 @@ def prepare_numpy_arrays(inputs_dict):
     def test_resize_token_embeddings(self):
         if not self.test_resize_embeddings:
             return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def _get_word_embedding_weight(model, embedding_layer):
             embeds = getattr(embedding_layer, "weight", None)
@@ -933,16 +1066,25 @@ def _get_word_embedding_weight(model, embedding_layer):
 
                 if old_output_embeddings is not None and new_output_embeddings is not None:
                     self.assertEqual(new_output_embeddings.shape[0], assert_size)
-                    self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1])
+                    self.assertEqual(
+                        new_output_embeddings.shape[1],
+                        old_output_embeddings.shape[1],
+                    )
 
                     models_equal = True
-                    for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
+                    for p1, p2 in zip(
+                        old_output_embeddings.value(),
+                        new_output_embeddings.value(),
+                    ):
                         if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
                             models_equal = False
                     self.assertTrue(models_equal)
 
     def test_lm_head_model_random_no_beam_search_generate(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
 
         # iterate over all generative models
@@ -969,16 +1111,25 @@ def test_lm_head_model_random_no_beam_search_generate(self):
 
             # check bad words tokens language generation
             # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
+            bad_words_ids = [
+                self._generate_random_bad_tokens(1, model),
+                self._generate_random_bad_tokens(2, model),
+            ]
             output_tokens = model.generate(
-                input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2
+                input_ids,
+                do_sample=True,
+                bad_words_ids=bad_words_ids,
+                num_return_sequences=2,
             )
             # only count generated tokens
             generated_ids = output_tokens[:, input_ids.shape[-1] :]
             self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
     def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
         if input_ids is None:
             input_ids = inputs_dict.get("input_features", None)
@@ -1011,7 +1162,10 @@ def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
                 self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
 
     def test_lm_head_model_random_beam_search_generate(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
 
         for model_class in self.all_generative_model_classes:
@@ -1026,7 +1180,12 @@ def test_lm_head_model_random_beam_search_generate(self):
 
             with self.assertRaises(AssertionError):
                 # generating more sequences than having beams leads is not possible
-                model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
+                model.generate(
+                    input_ids,
+                    do_sample=False,
+                    num_return_sequences=3,
+                    num_beams=2,
+                )
 
             # num_return_sequences > 1, sample
             self._check_generated_ids(
@@ -1038,20 +1197,37 @@ def test_lm_head_model_random_beam_search_generate(self):
                 )
             )
             # num_return_sequences > 1, greedy
-            self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2))
+            self._check_generated_ids(
+                model.generate(
+                    input_ids,
+                    do_sample=False,
+                    num_beams=2,
+                    num_return_sequences=2,
+                )
+            )
 
             # check bad words tokens language generation
             # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
+            bad_words_ids = [
+                self._generate_random_bad_tokens(1, model),
+                self._generate_random_bad_tokens(2, model),
+            ]
             output_tokens = model.generate(
-                input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2
+                input_ids,
+                do_sample=False,
+                bad_words_ids=bad_words_ids,
+                num_beams=2,
+                num_return_sequences=2,
             )
             # only count generated tokens
             generated_ids = output_tokens[:, input_ids.shape[-1] :]
             self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
     def test_lm_head_model_beam_search_generate_dict_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
         if input_ids is None:
             input_ids = inputs_dict.get("input_features", None)
@@ -1086,14 +1262,20 @@ def test_lm_head_model_beam_search_generate_dict_outputs(self):
                 self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
 
     def test_loss_computation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
             model = model_class(config)
             if getattr(model, "hf_compute_loss", None):
                 # The number of elements in the loss should be the same as the number of elements in the label
                 prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
                 added_label = prepared_for_class[
-                    sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0]
+                    sorted(
+                        list(prepared_for_class.keys() - inputs_dict.keys()),
+                        reverse=True,
+                    )[0]
                 ]
                 loss_size = tf.size(added_label)
 
@@ -1104,7 +1286,11 @@ def test_loss_computation(self):
 
                 # Test that model correctly compute the loss with kwargs
                 prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                possible_input_names = {"input_ids", "pixel_values", "input_features"}
+                possible_input_names = {
+                    "input_ids",
+                    "pixel_values",
+                    "input_features",
+                }
                 input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
                 model_input = prepared_for_class.pop(input_name)
 
@@ -1148,8 +1334,15 @@ def test_loss_computation(self):
                 self.assertEqual(loss.shape, [loss_size])
 
     def test_generate_with_headmasking(self):
-        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        attention_names = [
+            "encoder_attentions",
+            "decoder_attentions",
+            "cross_attentions",
+        ]
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_generative_model_classes:
             model = model_class(config)
@@ -1184,7 +1377,10 @@ def test_generate_with_headmasking(self):
     def test_load_with_mismatched_shapes(self):
         if not self.test_mismatched_shapes:
             return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
@@ -1291,7 +1487,13 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
 def random_attention_mask(shape, rng=None, name=None, dtype=None):
     attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype)
     # make sure that at least one token is attended to for each batch
-    attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1)
+    attn_mask = tf.concat(
+        [
+            tf.constant(value=1, shape=(shape[0], 1), dtype=dtype),
+            attn_mask[:, 1:],
+        ],
+        axis=1,
+    )
     return attn_mask
 
 
@@ -1308,7 +1510,10 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
     for _ in range(total_dims):
         values.append(rng.random() * scale)
 
-    return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape)
+    return tf.reshape(
+        tf.constant(values, dtype=dtype if dtype is not None else tf.float32),
+        shape=shape,
+    )
 
 
 @require_tf
@@ -1387,12 +1592,34 @@ def test_top_k_top_p_filtering(self):
         )
 
         non_inf_expected_idx = tf.convert_to_tensor(
-            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]],
+            [
+                [0, 0],
+                [0, 9],
+                [0, 10],
+                [0, 25],
+                [0, 26],
+                [1, 13],
+                [1, 17],
+                [1, 18],
+                [1, 20],
+                [1, 27],
+            ],
             dtype=tf.int32,
         )  # expected non filtered idx as noted above
 
         non_inf_expected_output = tf.convert_to_tensor(
-            [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023],
+            [
+                8.222099,
+                7.3534126,
+                8.432078,
+                7.4402075,
+                9.38451,
+                6.271159,
+                8.827531,
+                5.4402995,
+                7.3857956,
+                9.677023,
+            ],
             dtype=tf.float32,
         )  # expected non filtered values as noted above
 
@@ -1423,19 +1650,31 @@ def tearDownClass(cls):
             pass
 
         try:
-            delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org")
+            delete_repo(
+                token=cls._token,
+                name="test-model-tf-org",
+                organization="valid_org",
+            )
         except HTTPError:
             pass
 
     def test_push_to_hub(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
         model = TFBertModel(config)
         # Make sure model is properly initialized
         _ = model(model.dummy_inputs)
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(os.path.join(tmp_dir, "test-model-tf"), push_to_hub=True, use_auth_token=self._token)
+            model.save_pretrained(
+                os.path.join(tmp_dir, "test-model-tf"),
+                push_to_hub=True,
+                use_auth_token=self._token,
+            )
 
             new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
             models_equal = True
@@ -1446,7 +1685,11 @@ def test_push_to_hub(self):
 
     def test_push_to_hub_with_model_card(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
         model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -1455,7 +1698,11 @@ def test_push_to_hub_with_model_card(self):
 
     def test_push_to_hub_in_organization(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
         model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py
index 233ec6662b820..6f8c142b654d8 100644
--- a/tests/test_modeling_tf_convnext.py
+++ b/tests/test_modeling_tf_convnext.py
@@ -16,6 +16,7 @@
 
 import inspect
 import unittest
+from typing import List, Tuple
 
 from transformers import ConvNextConfig
 from transformers.file_utils import cached_property, is_tf_available, is_vision_available
@@ -222,6 +223,50 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
+    # Since ConvNext does not have any attention we need to rewrite this test.
+    def test_model_outputs_equivalence(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
+            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
+            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
+
+            def recursive_check(tuple_object, dict_object):
+                if isinstance(tuple_object, (List, Tuple)):
+                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
+                        recursive_check(tuple_iterable_value, dict_iterable_value)
+                elif tuple_object is None:
+                    return
+                else:
+                    self.assertTrue(
+                        all(tf.equal(tuple_object, dict_object)),
+                        msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}",
+                    )
+
+                recursive_check(tuple_output, dict_output)
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
+
     def test_for_image_classification(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_image_classification(*config_and_inputs)

From 7dcd98a346e91803660d1bb3ee0f4f6b8bb28bd2 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:18:41 +0530
Subject: [PATCH 45/65] rebasing

---
 playground.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 playground.py

diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000000000..8a53d5babd2be
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,38 @@
+import tensorflow as tf
+from transformers import AutoFeatureExtractor
+
+# import your TFConvNextForImageClassification class here, we will take care
+# of adding the boilerplate to run `from transformers import
+# TFConvNextForImageClassification` later
+from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
+from transformers import ConvNextForImageClassification
+
+from PIL import Image
+
+# model = ConvNextForImageClassification.from_pretrained(
+#     "facebook/convnext-tiny-224",
+# )
+# print(f"Model State Dict:\n")
+# all_keys = list(model.state_dict().keys())
+# print([k for k in all_keys if "layer_scale" in k])
+
+model = TFConvNextForImageClassification.from_pretrained(
+    "facebook/convnext-tiny-224",
+    from_pt=True,
+)  # notice the `from_pt` argument
+print(model.summary(expand_nested=True))
+
+
+feature_extractor = AutoFeatureExtractor.from_pretrained(
+    "facebook/convnext-tiny-224"
+)  # don't know if this is supposed to work with TF as well, change this as needed
+
+image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
+inputs = feature_extractor(images=image, return_tensors="tf")
+
+# forward pass
+outputs = model(**inputs)
+
+# verify the logits
+assert outputs.logits.shape == [1, 1000]
+tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From 95fffedb65595409a2cc12e25bdb876985bc8452 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:19:23 +0530
Subject: [PATCH 46/65] rebasing and removing playground.py.

---
 playground.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 playground.py

diff --git a/playground.py b/playground.py
deleted file mode 100644
index 8a53d5babd2be..0000000000000
--- a/playground.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import tensorflow as tf
-from transformers import AutoFeatureExtractor
-
-# import your TFConvNextForImageClassification class here, we will take care
-# of adding the boilerplate to run `from transformers import
-# TFConvNextForImageClassification` later
-from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
-from transformers import ConvNextForImageClassification
-
-from PIL import Image
-
-# model = ConvNextForImageClassification.from_pretrained(
-#     "facebook/convnext-tiny-224",
-# )
-# print(f"Model State Dict:\n")
-# all_keys = list(model.state_dict().keys())
-# print([k for k in all_keys if "layer_scale" in k])
-
-model = TFConvNextForImageClassification.from_pretrained(
-    "facebook/convnext-tiny-224",
-    from_pt=True,
-)  # notice the `from_pt` argument
-print(model.summary(expand_nested=True))
-
-
-feature_extractor = AutoFeatureExtractor.from_pretrained(
-    "facebook/convnext-tiny-224"
-)  # don't know if this is supposed to work with TF as well, change this as needed
-
-image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
-inputs = feature_extractor(images=image, return_tensors="tf")
-
-# forward pass
-outputs = model(**inputs)
-
-# verify the logits
-assert outputs.logits.shape == [1, 1000]
-tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From f8129a118ff2fe8a6e46f3cd3e56182ad718deb4 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:18:41 +0530
Subject: [PATCH 47/65] rebasing

---
 playground.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 playground.py

diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000000000..8a53d5babd2be
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,38 @@
+import tensorflow as tf
+from transformers import AutoFeatureExtractor
+
+# import your TFConvNextForImageClassification class here, we will take care
+# of adding the boilerplate to run `from transformers import
+# TFConvNextForImageClassification` later
+from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
+from transformers import ConvNextForImageClassification
+
+from PIL import Image
+
+# model = ConvNextForImageClassification.from_pretrained(
+#     "facebook/convnext-tiny-224",
+# )
+# print(f"Model State Dict:\n")
+# all_keys = list(model.state_dict().keys())
+# print([k for k in all_keys if "layer_scale" in k])
+
+model = TFConvNextForImageClassification.from_pretrained(
+    "facebook/convnext-tiny-224",
+    from_pt=True,
+)  # notice the `from_pt` argument
+print(model.summary(expand_nested=True))
+
+
+feature_extractor = AutoFeatureExtractor.from_pretrained(
+    "facebook/convnext-tiny-224"
+)  # don't know if this is supposed to work with TF as well, change this as needed
+
+image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
+inputs = feature_extractor(images=image, return_tensors="tf")
+
+# forward pass
+outputs = model(**inputs)
+
+# verify the logits
+assert outputs.logits.shape == [1, 1000]
+tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From dab6866746a923e87fe3bf8d0b19b10ad28425ed Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Sun, 13 Feb 2022 17:19:23 +0530
Subject: [PATCH 48/65] rebasing and removing playground.py.

---
 playground.py | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 playground.py

diff --git a/playground.py b/playground.py
deleted file mode 100644
index 8a53d5babd2be..0000000000000
--- a/playground.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import tensorflow as tf
-from transformers import AutoFeatureExtractor
-
-# import your TFConvNextForImageClassification class here, we will take care
-# of adding the boilerplate to run `from transformers import
-# TFConvNextForImageClassification` later
-from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification
-from transformers import ConvNextForImageClassification
-
-from PIL import Image
-
-# model = ConvNextForImageClassification.from_pretrained(
-#     "facebook/convnext-tiny-224",
-# )
-# print(f"Model State Dict:\n")
-# all_keys = list(model.state_dict().keys())
-# print([k for k in all_keys if "layer_scale" in k])
-
-model = TFConvNextForImageClassification.from_pretrained(
-    "facebook/convnext-tiny-224",
-    from_pt=True,
-)  # notice the `from_pt` argument
-print(model.summary(expand_nested=True))
-
-
-feature_extractor = AutoFeatureExtractor.from_pretrained(
-    "facebook/convnext-tiny-224"
-)  # don't know if this is supposed to work with TF as well, change this as needed
-
-image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png")  # you might need to change the relative path
-inputs = feature_extractor(images=image, return_tensors="tf")
-
-# forward pass
-outputs = model(**inputs)
-
-# verify the logits
-assert outputs.logits.shape == [1, 1000]
-tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4)

From 69b541393442f188c6fc2f9026278e1a812b9ff2 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 24 Feb 2022 12:30:18 +0530
Subject: [PATCH 49/65] chore: moved convnext test to the correct location

---
 tests/{ => convnext}/test_modeling_tf_convnext.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{ => convnext}/test_modeling_tf_convnext.py (100%)

diff --git a/tests/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py
similarity index 100%
rename from tests/test_modeling_tf_convnext.py
rename to tests/convnext/test_modeling_tf_convnext.py

From 15c6814e322420e7b36c35ca346f53f3c4e8b44e Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 24 Feb 2022 12:38:28 +0530
Subject: [PATCH 50/65] fix: locations for the test file of convnext.

---
 .../models/convnext/modeling_tf_convnext.py   | 2205 ++++++++++++-----
 tests/convnext/test_modeling_tf_convnext.py   |   45 +
 tests/test_modeling_tf_common.py              |  419 +++-
 3 files changed, 2031 insertions(+), 638 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 328194dddbc2c..2038f29e56cf8 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2022 Meta Platforms Inc. and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2019 HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,609 +12,1710 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" TF 2.0 ConvNext model."""
 
 
-from typing import Dict, Optional, Tuple, Union
-
-import numpy as np
-import tensorflow as tf
-
-from ...activations_tf import get_tf_activation
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
-from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput
-from ...modeling_tf_utils import (
-    TFModelInputType,
-    TFPreTrainedModel,
-    TFSequenceClassificationLoss,
-    get_initializer,
-    input_processing,
-    keras_serializable,
+import copy
+import inspect
+import json
+import os
+import random
+import tempfile
+import unittest
+from importlib import import_module
+from typing import List, Tuple
+
+from huggingface_hub import delete_repo, login
+from requests.exceptions import HTTPError
+from transformers import is_tf_available
+from transformers.models.auto import get_values
+from transformers.testing_utils import tooslow  # noqa: F401
+from transformers.testing_utils import (
+    PASS,
+    USER,
+    CaptureLogger,
+    _tf_gpu_memory_limit,
+    is_pt_tf_cross_test,
+    is_staging_test,
+    require_tf,
+    require_tf2onnx,
+    slow,
 )
-from ...utils import logging
-from .configuration_convnext import ConvNextConfig
-
+from transformers.utils import logging
+
+
+if is_tf_available():
+    import numpy as np
+    import tensorflow as tf
+
+    from transformers import (
+        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+        TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+        TF_MODEL_FOR_MASKED_LM_MAPPING,
+        TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
+        TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
+        TF_MODEL_FOR_PRETRAINING_MAPPING,
+        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+        TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
+        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+        BertConfig,
+        TFAutoModel,
+        TFAutoModelForSequenceClassification,
+        TFBertModel,
+        TFSharedEmbeddings,
+        tf_top_k_top_p_filtering,
+    )
+    from transformers.generation_tf_utils import (
+        TFBeamSampleDecoderOnlyOutput,
+        TFBeamSampleEncoderDecoderOutput,
+        TFBeamSearchDecoderOnlyOutput,
+        TFBeamSearchEncoderDecoderOutput,
+        TFGreedySearchDecoderOnlyOutput,
+        TFGreedySearchEncoderDecoderOutput,
+        TFSampleDecoderOnlyOutput,
+        TFSampleEncoderDecoderOutput,
+    )
 
-logger = logging.get_logger(__name__)
+    if _tf_gpu_memory_limit is not None:
+        gpus = tf.config.list_physical_devices("GPU")
+        for gpu in gpus:
+            # Restrict TensorFlow to only allocate x GB of memory on the GPUs
+            try:
+                tf.config.set_logical_device_configuration(
+                    gpu,
+                    [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)],
+                )
+                logical_gpus = tf.config.list_logical_devices("GPU")
+                print("Logical GPUs", logical_gpus)
+            except RuntimeError as e:
+                # Virtual devices must be set before GPUs have been initialized
+                print(e)
+
+
+def _config_zero_init(config):
+    configs_no_init = copy.deepcopy(config)
+    for key in configs_no_init.__dict__.keys():
+        if "_range" in key or "_std" in key:
+            setattr(configs_no_init, key, 0.0)
+    return configs_no_init
+
+
+@require_tf
+class TFModelTesterMixin:
+
+    model_tester = None
+    all_model_classes = ()
+    all_generative_model_classes = ()
+    test_mismatched_shapes = True
+    test_resize_embeddings = True
+    test_head_masking = True
+    is_encoder_decoder = False
+
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
+        inputs_dict = copy.deepcopy(inputs_dict)
+
+        if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
+            inputs_dict = {
+                k: tf.tile(
+                    tf.expand_dims(v, 1),
+                    (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1),
+                )
+                if isinstance(v, tf.Tensor) and v.ndim > 0
+                else v
+                for k, v in inputs_dict.items()
+            }
 
+        if return_labels:
+            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
+                inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
+            elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
+                inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+                inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+            elif model_class in [
+                *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
+                *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
+            ]:
+                inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+            elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
+                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+            elif model_class in [
+                *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
+                *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
+                *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
+            ]:
+                inputs_dict["labels"] = tf.zeros(
+                    (
+                        self.model_tester.batch_size,
+                        self.model_tester.seq_length,
+                    ),
+                    dtype=tf.int32,
+                )
+        return inputs_dict
+
+    def test_initialization(self):
+        pass
+
+    def test_save_load(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname, saved_model=False)
+                model = model_class.from_pretrained(tmpdirname)
+                after_outputs = model(self._prepare_for_class(inputs_dict, model_class))
+
+                self.assert_outputs_same(after_outputs, outputs)
+
+    def test_save_load_config(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            model_config = model.get_config()
+            # make sure that returned config is jsonifiable, which is required by keras
+            json.dumps(model_config)
+            new_model = model_class.from_config(model.get_config())
+            # make sure it also accepts a normal config
+            _ = model_class.from_config(model.config)
+            _ = new_model(self._prepare_for_class(inputs_dict, model_class))  # Build model
+            new_model.set_weights(model.get_weights())
+            after_outputs = new_model(self._prepare_for_class(inputs_dict, model_class))
+
+            self.assert_outputs_same(after_outputs, outputs)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.call)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            if model.config.is_encoder_decoder:
+                expected_arg_names = [
+                    "input_ids",
+                    "attention_mask",
+                    "decoder_input_ids",
+                    "decoder_attention_mask",
+                ]
+                expected_arg_names.extend(
+                    ["head_mask", "decoder_head_mask"] if "head_mask" and "decoder_head_mask" in arg_names else []
+                )
+                # Necessary to handle BART with newly added cross_attn_head_mask
+                expected_arg_names.extend(
+                    ["cross_attn_head_mask", "encoder_outputs"]
+                    if "cross_attn_head_mask" in arg_names
+                    else ["encoder_outputs"]
+                )
+                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+            else:
+                expected_arg_names = ["input_ids"]
+                self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def test_onnx_compliancy(self):
+        if not self.test_onnx:
+            return
+
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        INTERNAL_OPS = [
+            "Assert",
+            "AssignVariableOp",
+            "EmptyTensorList",
+            "ReadVariableOp",
+            "ResourceGather",
+            "TruncatedNormal",
+            "VarHandleOp",
+            "VarIsInitializedOp",
+        ]
+        onnx_ops = []
 
-_CONFIG_FOR_DOC = "ConvNextConfig"
-_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"
+        with open(os.path.join(".", "utils", "tf_ops", "onnx.json")) as f:
+            onnx_opsets = json.load(f)["opsets"]
 
+        for i in range(1, self.onnx_min_opset + 1):
+            onnx_ops.extend(onnx_opsets[str(i)])
 
-class TFConvNextDropPath(tf.keras.layers.Layer):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-    References:
-        (1) github.com:rwightman/pytorch-image-models
-    """
+        for model_class in self.all_model_classes:
+            model_op_names = set()
 
-    def __init__(self, drop_path, **kwargs):
-        super().__init__(**kwargs)
-        self.drop_path = drop_path
+            with tf.Graph().as_default() as g:
+                model = model_class(config)
+                model(model.dummy_inputs)
 
-    def call(self, x, training=None):
-        if training:
-            keep_prob = 1 - self.drop_path
-            shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
-            random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
-            random_tensor = tf.floor(random_tensor)
-            return (x / keep_prob) * random_tensor
-        return x
+                for op in g.get_operations():
+                    model_op_names.add(op.node_def.op)
 
+            model_op_names = sorted(model_op_names)
+            incompatible_ops = []
 
-class TFConvNextEmbeddings(tf.keras.layers.Layer):
-    """This class is comparable to (and inspired by) the SwinEmbeddings class
-    found in src/transformers/models/swin/modeling_swin.py.
-    """
+            for op in model_op_names:
+                if op not in onnx_ops and op not in INTERNAL_OPS:
+                    incompatible_ops.append(op)
 
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.patch_embeddings = tf.keras.layers.Conv2D(
-            filters=config.hidden_sizes[0],
-            kernel_size=config.patch_size,
-            strides=config.patch_size,
-            name="patch_embeddings",
-            kernel_initializer=get_initializer(config.initializer_range),
-            bias_initializer="zeros",
-        )
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
-
-    def call(self, pixel_values):
-        if isinstance(pixel_values, dict):
-            pixel_values = pixel_values["pixel_values"]
-
-        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
-        # So change the input format from `NCHW` to `NHWC`.
-        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
-        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
-
-        embeddings = self.patch_embeddings(pixel_values)
-        embeddings = self.layernorm(embeddings)
-        return embeddings
-
-
-class TFConvNextLayer(tf.keras.layers.Layer):
-    """This corresponds to the `Block` class in the original implementation.
-
-    There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
-    H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back
-
-    The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to follow
-    NHWC ordering, we can just apply the operations straight-away without the permutation.
-
-    Args:
-        config ([`ConvNextConfig`]): Model configuration class.
-        dim (`int`): Number of input channels.
-        drop_path (`float`): Stochastic depth rate. Default: 0.0.
-    """
-
-    def __init__(self, config, dim, drop_path=0.0, **kwargs):
-        super().__init__(**kwargs)
-        self.dim = dim
-        self.config = config
-        self.dwconv = tf.keras.layers.Conv2D(
-            filters=dim,
-            kernel_size=7,
-            padding="same",
-            groups=dim,
-            kernel_initializer=get_initializer(config.initializer_range),
-            bias_initializer="zeros",
-            name="dwconv",
-        )  # depthwise conv
-        self.layernorm = tf.keras.layers.LayerNormalization(
-            epsilon=1e-6,
-            name="layernorm",
-        )
-        self.pwconv1 = tf.keras.layers.Dense(
-            units=4 * dim,
-            kernel_initializer=get_initializer(config.initializer_range),
-            bias_initializer="zeros",
-            name="pwconv1",
-        )  # pointwise/1x1 convs, implemented with linear layers
-        self.act = get_tf_activation(config.hidden_act)
-        self.pwconv2 = tf.keras.layers.Dense(
-            units=dim,
-            kernel_initializer=get_initializer(config.initializer_range),
-            bias_initializer="zeros",
-            name="pwconv2",
-        )
-        # Using `layers.Activation` instead of `tf.identity` to better control `training`
-        # behaviour.
-        self.drop_path = (
-            TFConvNextDropPath(
-                drop_path,
-                name="drop_path",
-            )
-            if drop_path > 0.0
-            else tf.keras.layers.Activation(
-                "linear",
-                name="drop_path",
-            )
-        )
+            self.assertEqual(len(incompatible_ops), 0, incompatible_ops)
 
-    def build(self, input_shape: tf.TensorShape):
-        # PT's `nn.Parameters` must be mapped to a TF layer weight to inherit the same name hierarchy (and vice-versa)
-        self.layer_scale_parameter = (
-            self.add_weight(
-                shape=(self.dim,),
-                initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
-                trainable=True,
-                name="layer_scale_parameter",
-            )
-            if self.config.layer_scale_init_value > 0
-            else None
-        )
-        super().build(input_shape)
-
-    def call(self, hidden_states, training=False):
-        input = hidden_states
-        x = self.dwconv(hidden_states)
-        x = self.layernorm(x)
-        x = self.pwconv1(x)
-        x = self.act(x)
-        x = self.pwconv2(x)
-
-        if self.layer_scale_parameter is not None:
-            x = self.layer_scale_parameter * x
-
-        x = input + self.drop_path(x, training=training)
-        return x
-
-
-class TFConvNextStage(tf.keras.layers.Layer):
-    """ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks.
-
-    Args:
-        config ([`ConvNextConfig`]): Model configuration class.
-        in_channels (`int`): Number of input channels.
-        out_channels (`int`): Number of output channels.
-        depth (`int`): Number of residual blocks.
-        drop_path_rates(`List[float]`): Stochastic depth rates for each layer.
-    """
-
-    def __init__(
-        self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs
-    ):
-        super().__init__(**kwargs)
-        if in_channels != out_channels or stride > 1:
-            self.downsampling_layer = [
-                tf.keras.layers.LayerNormalization(
-                    epsilon=1e-6,
-                    name="downsampling_layer.0",
-                ),
-                # Inputs to this layer will follow NHWC format since we
-                # transposed the inputs from NCHW to NHWC in the `TFConvNextEmbeddings`
-                # layer. All the outputs throughout the model will be in NHWC
-                # from this point on until the output where we again change to
-                # NCHW.
-                tf.keras.layers.Conv2D(
-                    filters=out_channels,
-                    kernel_size=kernel_size,
-                    strides=stride,
-                    kernel_initializer=get_initializer(config.initializer_range),
-                    bias_initializer="zeros",
-                    name="downsampling_layer.1",
-                ),
-            ]
-        else:
-            self.downsampling_layer = [tf.identity]
-
-        drop_path_rates = drop_path_rates or [0.0] * depth
-        self.layers = [
-            TFConvNextLayer(
-                config,
-                dim=out_channels,
-                drop_path=drop_path_rates[j],
-                name=f"layers.{j}",
-            )
-            for j in range(depth)
-        ]
+    @require_tf2onnx
+    @slow
+    def test_onnx_runtime_optimize(self):
+        if not self.test_onnx:
+            return
 
-    def call(self, hidden_states):
-        for layer in self.downsampling_layer:
-            hidden_states = layer(hidden_states)
-        for layer in self.layers:
-            hidden_states = layer(hidden_states)
-        return hidden_states
-
-
-class TFConvNextEncoder(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.stages = []
-        drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
-        cur = 0
-        prev_chs = config.hidden_sizes[0]
-        for i in range(config.num_stages):
-            out_chs = config.hidden_sizes[i]
-            stage = TFConvNextStage(
-                config,
-                in_channels=prev_chs,
-                out_channels=out_chs,
-                stride=2 if i > 0 else 1,
-                depth=config.depths[i],
-                drop_path_rates=drop_path_rates[cur],
-                name=f"stages.{i}",
-            )
-            self.stages.append(stage)
-            cur += config.depths[i]
-            prev_chs = out_chs
+        import onnxruntime
+        import tf2onnx
 
-    def call(self, hidden_states, output_hidden_states=False, return_dict=True):
-        all_hidden_states = () if output_hidden_states else None
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-        for i, layer_module in enumerate(self.stages):
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model(model.dummy_inputs)
 
-            hidden_states = layer_module(hidden_states)
+            onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset)
 
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
+            onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
 
-        if not return_dict:
-            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
+    def test_keras_save_load(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-        return TFBaseModelOutput(
-            last_hidden_state=hidden_states,
-            hidden_states=all_hidden_states,
+        tf_main_layer_classes = set(
+            module_member
+            for model_class in self.all_model_classes
+            for module in (import_module(model_class.__module__),)
+            for module_member_name in dir(module)
+            if module_member_name.endswith("MainLayer")
+            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
+            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
+            for module_member in (getattr(module, module_member_name),)
+            if isinstance(module_member, type)
+            and tf.keras.layers.Layer in module_member.__bases__
+            and getattr(module_member, "_keras_serializable", False)
         )
+        for main_layer_class in tf_main_layer_classes:
+            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
+            if "T5" in main_layer_class.__name__:
+                # Take the same values than in TFT5ModelTester for this shared layer
+                shared = TFSharedEmbeddings(99, 32, name="shared")
+                config.use_cache = inputs_dict.pop("use_cache", None)
+                main_layer = main_layer_class(config, embed_tokens=shared)
+            else:
+                main_layer = main_layer_class(config)
+
+            symbolic_inputs = {
+                name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
+            }
 
+            model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
+            outputs = model(inputs_dict)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                filepath = os.path.join(tmpdirname, "keras_model.h5")
+                model.save(filepath)
+                if "T5" in main_layer_class.__name__:
+                    model = tf.keras.models.load_model(
+                        filepath,
+                        custom_objects={
+                            main_layer_class.__name__: main_layer_class,
+                            "TFSharedEmbeddings": TFSharedEmbeddings,
+                        },
+                    )
+                else:
+                    model = tf.keras.models.load_model(
+                        filepath,
+                        custom_objects={main_layer_class.__name__: main_layer_class},
+                    )
+                assert isinstance(model, tf.keras.Model)
+                after_outputs = model(inputs_dict)
+                self.assert_outputs_same(after_outputs, outputs)
+
+    def assert_outputs_same(self, after_outputs, outputs):
+        # Make sure we don't have nans
+        if isinstance(after_outputs, tf.Tensor):
+            out_1 = after_outputs.numpy()
+        elif isinstance(after_outputs, dict):
+            out_1 = after_outputs[list(after_outputs.keys())[0]].numpy()
+        else:
+            out_1 = after_outputs[0].numpy()
+        out_2 = outputs[0].numpy()
+        self.assertEqual(out_1.shape, out_2.shape)
+        out_1 = out_1[~np.isnan(out_1)]
+        out_2 = out_2[~np.isnan(out_2)]
+        max_diff = np.amax(np.abs(out_1 - out_2))
+        self.assertLessEqual(max_diff, 1e-5)
+
+    @is_pt_tf_cross_test
+    def test_pt_tf_model_equivalence(self):
+        import torch
+
+        import transformers
+
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
+            pt_model_class = getattr(transformers, pt_model_class_name)
+
+            config.output_hidden_states = True
+
+            tf_model = model_class(config)
+            pt_model = pt_model_class(config)
+
+            # Check we can load pt model in tf and vice-versa with model => model functions
+            tf_model = transformers.load_pytorch_model_in_tf2_model(
+                tf_model,
+                pt_model,
+                tf_inputs=self._prepare_for_class(inputs_dict, model_class),
+            )
+            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+
+            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = {}
+            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
+                if type(key) == bool:
+                    pt_inputs_dict[name] = key
+                elif name == "input_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                elif name == "pixel_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                elif name == "input_features":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                else:
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(
+                self._prepare_for_class(inputs_dict, model_class),
+                training=False,
+            )
 
-@keras_serializable
-class TFConvNextMainLayer(tf.keras.layers.Layer):
-    config_class = ConvNextConfig
-
-    def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs):
-        super().__init__(**kwargs)
-
-        self.config = config
-        self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
-        self.encoder = TFConvNextEncoder(config, name="encoder")
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
-        self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None
-
-    def call(
-        self,
-        pixel_values: Optional[TFModelInputType] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        training: bool = False,
-        **kwargs,
-    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            tf_hidden_states = tfo[0].numpy()
+            pt_hidden_states = pto[0].numpy()
+
+            tf_nans = np.copy(np.isnan(tf_hidden_states))
+            pt_nans = np.copy(np.isnan(pt_hidden_states))
+
+            pt_hidden_states[tf_nans] = 0
+            tf_hidden_states[tf_nans] = 0
+            pt_hidden_states[pt_nans] = 0
+            tf_hidden_states[pt_nans] = 0
+
+            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
+            self.assertLessEqual(max_diff, 4e-2)
+
+            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
+                torch.save(pt_model.state_dict(), pt_checkpoint_path)
+                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
+
+                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
+                tf_model.save_weights(tf_checkpoint_path)
+                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
+
+            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = {}
+            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
+                if type(key) == bool:
+                    key = np.array(key, dtype=bool)
+                    pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long)
+                elif name == "input_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                elif name == "pixel_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                elif name == "input_features":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                else:
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))
+            tfo = tfo[0].numpy()
+            pto = pto[0].numpy()
+            tf_nans = np.copy(np.isnan(tfo))
+            pt_nans = np.copy(np.isnan(pto))
+
+            pto[tf_nans] = 0
+            tfo[tf_nans] = 0
+            pto[pt_nans] = 0
+            tfo[pt_nans] = 0
+
+            max_diff = np.amax(np.abs(tfo - pto))
+            self.assertLessEqual(max_diff, 4e-2)
+
+    def test_compile_tf_model(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        max_input = getattr(self.model_tester, "max_position_embeddings", 512)
+        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
+        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+        metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
+
+        for model_class in self.all_model_classes:
+            if model_class.__name__ in [
+                "TFSpeech2TextModel",
+                "TFSpeech2TextForConditionalGeneration",
+            ]:
+                inputs = {
+                    "decoder_input_ids": tf.keras.Input(
+                        batch_shape=(2, max_input),
+                        name="decoder_input_ids",
+                        dtype="int32",
+                    ),
+                    "input_features": tf.keras.Input(
+                        batch_shape=(
+                            2,
+                            max_input,
+                            self.model_tester.input_feat_per_channel * self.model_tester.input_channels,
+                        ),
+                        name="input_features",
+                        dtype="float32",
+                    ),
+                }
+            elif self.is_encoder_decoder:
+                inputs = {
+                    "decoder_input_ids": tf.keras.Input(
+                        batch_shape=(2, max_input),
+                        name="decoder_input_ids",
+                        dtype="int32",
+                    ),
+                    "input_ids": tf.keras.Input(
+                        batch_shape=(2, max_input),
+                        name="input_ids",
+                        dtype="int32",
+                    ),
+                }
+            # `pixel_values` implies that the input is an image
+            elif model_class.main_input_name == "pixel_values":
+                inputs = tf.keras.Input(
+                    batch_shape=(
+                        3,
+                        self.model_tester.num_channels,
+                        self.model_tester.image_size,
+                        self.model_tester.image_size,
+                    ),
+                    name="pixel_values",
+                    dtype="float32",
+                )
+            elif model_class.__name__ in ["TFCLIPModel"]:
+                inputs = {
+                    "input_ids": tf.keras.Input(
+                        batch_shape=(3, max_input),
+                        name="input_ids",
+                        dtype="int32",
+                    ),
+                    "pixel_values": tf.keras.Input(
+                        batch_shape=(
+                            3,
+                            self.model_tester.vision_model_tester.num_channels,
+                            self.model_tester.vision_model_tester.image_size,
+                            self.model_tester.vision_model_tester.image_size,
+                        ),
+                        name="pixel_values",
+                        dtype="float32",
+                    ),
+                }
+            elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
+                inputs = tf.keras.Input(
+                    batch_shape=(4, 2, max_input),
+                    name="input_ids",
+                    dtype="int32",
+                )
+            else:
+                inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
+
+            # Prepare our model
+            model = model_class(config)
+            model(self._prepare_for_class(inputs_dict, model_class))  # Model must be called before saving.
+            # Let's load it from the disk to be sure we can use pretrained weights
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname, saved_model=False)
+                model = model_class.from_pretrained(tmpdirname)
+
+            outputs_dict = model(inputs)
+            hidden_states = outputs_dict[0]
+
+            # Add a dense layer on top to test integration with other keras modules
+            outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
+
+            # Compile extended model
+            extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs])
+            extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
+
+    def test_keyword_and_dict_args(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+
+            outputs_dict = model(inputs)
+
+            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
+            outputs_keywords = model(**inputs_keywords)
+            output_dict = outputs_dict[0].numpy()
+            output_keywords = outputs_keywords[0].numpy()
+
+            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
+
+    def test_attention_outputs(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+        decoder_seq_length = getattr(
+            self.model_tester,
+            "decoder_seq_length",
+            self.model_tester.seq_length,
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=pixel_values,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            training=training,
-            kwargs_call=kwargs,
+        encoder_seq_length = getattr(
+            self.model_tester,
+            "encoder_seq_length",
+            self.model_tester.seq_length,
         )
+        decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
+        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
+
+        def check_decoder_attentions_output(outputs):
+            out_len = len(outputs)
+            self.assertEqual(min(out_len % 2, out_len % 5), 0)  # differentiation due to newly added cross_attentions
+            decoder_attentions = outputs.decoder_attentions
+            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(decoder_attentions[0].shape[-3:]),
+                [
+                    self.model_tester.num_attention_heads,
+                    decoder_seq_length,
+                    decoder_key_length,
+                ],
+            )
 
-        if "input_ids" in inputs:
-            inputs["pixel_values"] = inputs.pop("input_ids")
+        def check_encoder_attentions_output(outputs):
+            attentions = [
+                t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions)
+            ]
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [
+                    self.model_tester.num_attention_heads,
+                    encoder_seq_length,
+                    encoder_key_length,
+                ],
+            )
 
-        if inputs["pixel_values"] is None:
-            raise ValueError("You have to specify pixel_values")
+        for model_class in self.all_model_classes:
+            inputs_dict["output_attentions"] = True
+            inputs_dict["use_cache"] = False
+            config.output_hidden_states = False
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            out_len = len(outputs)
+            self.assertEqual(config.output_hidden_states, False)
+            check_encoder_attentions_output(outputs)
+
+            if self.is_encoder_decoder:
+                model = model_class(config)
+                outputs = model(self._prepare_for_class(inputs_dict, model_class))
+                self.assertEqual(config.output_hidden_states, False)
+                check_decoder_attentions_output(outputs)
+
+            # Check that output attentions can also be changed via the config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            self.assertEqual(config.output_hidden_states, False)
+            check_encoder_attentions_output(outputs)
+
+            # Check attention is always last and order is fine
+            inputs_dict["output_attentions"] = True
+            config.output_hidden_states = True
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+
+            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
+            self.assertEqual(model.config.output_hidden_states, True)
+            check_encoder_attentions_output(outputs)
+
+    def test_headmasking(self):
+        if not self.test_head_masking:
+            return
+
+        random.Random().seed(42)
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        random.Random().seed()
+
+        inputs_dict["output_attentions"] = True
+        config.output_hidden_states = True
+        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+
+            # Prepare head_mask
+            def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
+                if i == 0:
+                    return tf.concat(
+                        (
+                            tf.zeros(1, dtype=tf.float32),
+                            tf.ones(attention_heads - 1, dtype=tf.float32),
+                        ),
+                        0,
+                    )
+                elif i == num_hidden_layers - 1:
+                    return tf.concat(
+                        (
+                            tf.zeros(attention_heads - 1, dtype=tf.float32),
+                            tf.ones(1, dtype=tf.float32),
+                        ),
+                        0,
+                    )
+                else:
+                    return tf.ones(attention_heads, dtype=tf.float32)
+
+            head_mask = tf.stack(
+                [
+                    prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers)
+                    for i in range(config.num_hidden_layers)
+                ],
+                0,
+            )
 
-        embedding_output = self.embeddings(inputs["pixel_values"], training=inputs["training"])
+            inputs = self._prepare_for_class(inputs_dict, model_class).copy()
+            inputs["head_mask"] = head_mask
+            if model.config.is_encoder_decoder:
+                signature = inspect.signature(model.call)
+                arg_names = [*signature.parameters.keys()]
+                if "decoder_head_mask" in arg_names:  # necessary differentiation because of T5 model
+                    inputs["decoder_head_mask"] = head_mask
+                if "cross_attn_head_mask" in arg_names:
+                    inputs["cross_attn_head_mask"] = head_mask
+
+            outputs = model(**inputs, return_dict=True)
+
+            def check_attentions_validity(attentions):
+                # Remove Nan
+                for t in attentions:
+                    self.assertLess(
+                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(),
+                        (tf.size(t) / 4).numpy(),
+                    )  # Check we don't have more than 25% nans (arbitrary)
+
+                attentions = [
+                    tf.where(tf.math.is_nan(t), 0.0, t) for t in attentions
+                ]  # remove them (the test is less complete)
+
+                self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
+                self.assertNotEqual(
+                    tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(),
+                    0.0,
+                )
+                if len(attentions) > 2:  # encoder-decoder models have only 2 layers in each module
+                    self.assertNotEqual(
+                        tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(),
+                        0.0,
+                    )
+                self.assertAlmostEqual(
+                    tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(),
+                    0.0,
+                )
+                self.assertNotEqual(
+                    tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(),
+                    0.0,
+                )
+
+            if model.config.is_encoder_decoder:
+                check_attentions_validity(outputs.encoder_attentions)
+                check_attentions_validity(outputs.decoder_attentions)
+                if "cross_attn_head_mask" in arg_names:
+                    check_attentions_validity(outputs.cross_attentions)
+            else:
+                check_attentions_validity(outputs.attentions)
+
+    def test_hidden_states_output(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def check_hidden_states_output(config, inputs_dict, model_class):
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            expected_num_layers = getattr(
+                self.model_tester,
+                "expected_num_hidden_layers",
+                self.model_tester.num_hidden_layers + 1,
+            )
 
-        encoder_outputs = self.encoder(
-            embedding_output,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            training=inputs["training"],
+            if model.config.is_encoder_decoder:
+                encoder_hidden_states = outputs.encoder_hidden_states
+                decoder_hidden_states = outputs.decoder_hidden_states
+
+                self.assertEqual(config.output_attentions, False)
+                self.assertEqual(len(encoder_hidden_states), expected_num_layers)
+                self.assertListEqual(
+                    list(encoder_hidden_states[0].shape[-2:]),
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
+                )
+                self.assertEqual(len(decoder_hidden_states), expected_num_layers)
+                self.assertListEqual(
+                    list(decoder_hidden_states[0].shape[-2:]),
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
+                )
+            else:
+                hidden_states = outputs.hidden_states
+                self.assertEqual(config.output_attentions, False)
+                self.assertEqual(len(hidden_states), expected_num_layers)
+                self.assertListEqual(
+                    list(hidden_states[0].shape[-2:]),
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
+                )
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(config, inputs_dict, model_class)
+
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+            check_hidden_states_output(config, inputs_dict, model_class)
+
+    def test_model_common_attributes(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        text_in_text_out_models = (
+            get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
+            + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
+            + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING)
         )
+        speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING)
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
+            if model_class in text_in_text_out_models:
+                x = model.get_output_embeddings()
+                assert isinstance(x, tf.keras.layers.Layer)
+                name = model.get_bias()
+                assert isinstance(name, dict)
+                for k, v in name.items():
+                    assert isinstance(v, tf.Variable)
+            elif model_class in speech_in_text_out_models:
+                x = model.get_output_embeddings()
+                assert isinstance(x, tf.keras.layers.Layer)
+                name = model.get_bias()
+                assert name is None
+            else:
+                x = model.get_output_embeddings()
+                assert x is None
+                name = model.get_bias()
+                assert name is None
+
+    def test_determinism(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            first, second = (
+                model(
+                    self._prepare_for_class(inputs_dict, model_class),
+                    training=False,
+                )[0],
+                model(
+                    self._prepare_for_class(inputs_dict, model_class),
+                    training=False,
+                )[0],
+            )
+            out_1 = first.numpy()
+            out_2 = second.numpy()
+            out_1 = out_1[~np.isnan(out_1)]
+            out_2 = out_2[~np.isnan(out_2)]
+            max_diff = np.amax(np.abs(out_1 - out_2))
+            self.assertLessEqual(max_diff, 1e-5)
+
+    def test_model_outputs_equivalence(self):
+        """Check that `return_dict=False` (tuple) and `return_dict=True` outputs hold the same values."""
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
+            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
+            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
+
+            def recursive_check(tuple_object, dict_object):
+                # Walk nested lists/tuples in lockstep, skip `None` entries and compare the leaf tensors.
+                if isinstance(tuple_object, (List, Tuple)):
+                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
+                        recursive_check(tuple_iterable_value, dict_iterable_value)
+                elif tuple_object is None:
+                    return
+                else:
+                    # `tf.reduce_all` reduces over every axis; the builtin `all` only iterates the first one.
+                    self.assertTrue(
+                        tf.reduce_all(tf.equal(tuple_object, dict_object)).numpy(),
+                        msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}",
+                    )
+
+            # Call at `check_equivalence` level so the comparison actually runs for every model output.
+            recursive_check(tuple_output, dict_output)
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(
+                model,
+                tuple_inputs,
+                dict_inputs,
+                {"output_hidden_states": True, "output_attentions": True},
+            )
 
-        last_hidden_state = encoder_outputs[0]
-        pooled_output = self.layernorm(self.pooler(last_hidden_state))
-
-        if not return_dict:
-            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
-
-        return TFBaseModelOutputWithPooling(
-            last_hidden_state=last_hidden_state,
-            pooler_output=pooled_output,
-            hidden_states=encoder_outputs.hidden_states,
-        )
+    def test_inputs_embeds(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            inputs = copy.deepcopy(inputs_dict)
+
+            if not self.is_encoder_decoder:
+                input_ids = inputs["input_ids"]
+                del inputs["input_ids"]
+            else:
+                encoder_input_ids = inputs["input_ids"]
+                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
+                del inputs["input_ids"]
+                inputs.pop("decoder_input_ids", None)
+
+            if not self.is_encoder_decoder:
+                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
+            else:
+                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
+                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)
+
+            inputs = self._prepare_for_class(inputs, model_class)
+
+            model(inputs)
+
+    def test_numpy_arrays_inputs(self):
+        """Check that numpy inputs give the same outputs when passed as a dict and as keyword arguments."""
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def prepare_numpy_arrays(inputs_dict):
+            # Convert every value to numpy: tensors via `.numpy()`, anything else via `np.array`.
+            inputs_np_dict = {}
+            for k, v in inputs_dict.items():
+                if tf.is_tensor(v):
+                    inputs_np_dict[k] = v.numpy()
+                else:
+                    # Convert the value itself; wrapping the key `k` would feed the key string to the model.
+                    inputs_np_dict[k] = np.array(v)
+
+            return inputs_np_dict
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+            inputs_np = prepare_numpy_arrays(inputs)
+
+            output_for_dict_input = model(inputs_np)
+            output_for_kw_input = model(**inputs_np)
+            self.assert_outputs_same(output_for_dict_input, output_for_kw_input)
+
+    def test_resize_token_embeddings(self):
+        if not self.test_resize_embeddings:
+            return
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def _get_word_embedding_weight(model, embedding_layer):
+            embeds = getattr(embedding_layer, "weight", None)
+            if embeds is not None:
+                return embeds
+
+            embeds = getattr(embedding_layer, "decoder", None)
+            if embeds is not None:
+                return embeds
+
+            model(model.dummy_inputs)
+
+            embeds = getattr(embedding_layer, "weight", None)
+            if embeds is not None:
+                return embeds
+
+            embeds = getattr(embedding_layer, "decoder", None)
+            if embeds is not None:
+                return embeds
+
+            return None
+
+        for model_class in self.all_model_classes:
+            for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
+                # build the embeddings
+                model = model_class(config=config)
+                old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
+                old_bias = model.get_bias()
+                old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
+                # reshape the embeddings
+                model.resize_token_embeddings(size)
+                new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
+                new_bias = model.get_bias()
+                new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
+
+                # check that the resized embeddings size matches the desired size.
+                assert_size = size if size is not None else config.vocab_size
+                self.assertEqual(new_input_embeddings.shape[0], assert_size)
+
+                # check that weights remain the same after resizing
+                models_equal = True
+                for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()):
+                    if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                        models_equal = False
+                self.assertTrue(models_equal)
+
+                if old_bias is not None and new_bias is not None:
+                    for old_weight, new_weight in zip(old_bias.values(), new_bias.values()):
+                        self.assertEqual(new_weight.shape[0], assert_size)
+
+                        models_equal = True
+                        for p1, p2 in zip(old_weight.value(), new_weight.value()):
+                            if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                                models_equal = False
+                        self.assertTrue(models_equal)
+
+                if old_output_embeddings is not None and new_output_embeddings is not None:
+                    self.assertEqual(new_output_embeddings.shape[0], assert_size)
+                    self.assertEqual(
+                        new_output_embeddings.shape[1],
+                        old_output_embeddings.shape[1],
+                    )
+
+                    models_equal = True
+                    for p1, p2 in zip(
+                        old_output_embeddings.value(),
+                        new_output_embeddings.value(),
+                    ):
+                        if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                            models_equal = False
+                    self.assertTrue(models_equal)
+
+    def test_lm_head_model_random_no_beam_search_generate(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict.get("input_ids", None)
+
+        # iterate over all generative models
+        for model_class in self.all_generative_model_classes:
+            model = model_class(config)
+
+            if config.bos_token_id is None:
+                # if bos token id is not defined model needs input_ids
+                with self.assertRaises(AssertionError):
+                    model.generate(do_sample=True, max_length=5)
+                # num_return_sequences = 1
+                self._check_generated_ids(model.generate(input_ids, do_sample=True))
+            elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]:
+                # Models with non-text inputs won't work here; num_return_sequences = 1
+                self._check_generated_ids(model.generate(do_sample=True, max_length=5))
+
+            with self.assertRaises(ValueError):
+                # generating multiple sequences when no beam search generation
+                # is not allowed as it would always generate the same sequences
+                model.generate(input_ids, do_sample=False, num_return_sequences=2)
+
+            # num_return_sequences > 1, sample
+            self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2))
+
+            # check bad words tokens language generation
+            # create list of 1-seq bad token and list of 2-seq of bad tokens
+            bad_words_ids = [
+                self._generate_random_bad_tokens(1, model),
+                self._generate_random_bad_tokens(2, model),
+            ]
+            output_tokens = model.generate(
+                input_ids,
+                do_sample=True,
+                bad_words_ids=bad_words_ids,
+                num_return_sequences=2,
+            )
+            # only count generated tokens
+            generated_ids = output_tokens[:, input_ids.shape[-1] :]
+            self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
+
+    def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict.get("input_ids", None)
+        if input_ids is None:
+            input_ids = inputs_dict.get("input_features", None)
+
+        # iterate over all generative models
+        for model_class in self.all_generative_model_classes:
+            model = model_class(config)
+            output_greedy = model.generate(
+                input_ids,
+                do_sample=False,
+                output_scores=True,
+                output_hidden_states=True,
+                output_attentions=True,
+                return_dict_in_generate=True,
+            )
+            output_sample = model.generate(
+                input_ids,
+                do_sample=True,
+                output_scores=True,
+                output_hidden_states=True,
+                output_attentions=True,
+                return_dict_in_generate=True,
+            )
 
+            if model.config.is_encoder_decoder:
+                self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput)
+                self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput)
+            else:
+                self.assertIsInstance(output_greedy, TFGreedySearchDecoderOnlyOutput)
+                self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
+
+    def test_lm_head_model_random_beam_search_generate(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict.get("input_ids", None)
+        # every generative model class is run with 2 beams, both with and without sampling
+        for model_class in self.all_generative_model_classes:
+            model = model_class(config)
+
+            if config.bos_token_id is None:
+                # without a bos token id the model needs explicit input_ids; num_return_sequences = 1
+                self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2))
+            else:
+                # num_return_sequences = 1
+                self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2))
+
+            with self.assertRaises(AssertionError):
+                # returning more sequences than there are beams is not possible
+                model.generate(
+                    input_ids,
+                    do_sample=False,
+                    num_return_sequences=3,
+                    num_beams=2,
+                )
+
+            # num_return_sequences > 1, sample
+            self._check_generated_ids(
+                model.generate(
+                    input_ids,
+                    do_sample=True,
+                    num_beams=2,
+                    num_return_sequences=2,
+                )
+            )
+            # num_return_sequences > 1, greedy
+            self._check_generated_ids(
+                model.generate(
+                    input_ids,
+                    do_sample=False,
+                    num_beams=2,
+                    num_return_sequences=2,
+                )
+            )
 
-class TFConvNextPreTrainedModel(TFPreTrainedModel):
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
-    config_class = ConvNextConfig
-    base_model_prefix = "convnext"
-    main_input_name = "pixel_values"
-
-    @property
-    def dummy_inputs(self) -> Dict[str, tf.Tensor]:
-        """
-        Dummy inputs to build the network.
-
-        Returns:
-            `Dict[str, tf.Tensor]`: The dummy inputs.
-        """
-        VISION_DUMMY_INPUTS = tf.random.uniform(
-            shape=(
-                3,
-                self.config.num_channels,
-                self.config.image_size,
-                self.config.image_size,
-            ),
-            dtype=tf.float32,
-        )
-        return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)}
+            # check generation with banned ("bad words") token sequences
+            # create one banned sequence of length 1 and one of length 2
+            bad_words_ids = [
+                self._generate_random_bad_tokens(1, model),
+                self._generate_random_bad_tokens(2, model),
+            ]
+            output_tokens = model.generate(
+                input_ids,
+                do_sample=False,
+                bad_words_ids=bad_words_ids,
+                num_beams=2,
+                num_return_sequences=2,
+            )
+            # only inspect the generated tokens, i.e. drop the leading `input_ids.shape[-1]` positions
+            generated_ids = output_tokens[:, input_ids.shape[-1] :]
+            self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
+
+    def test_lm_head_model_beam_search_generate_dict_outputs(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict.get("input_ids", None)
+        if input_ids is None:  # fall back to "input_features" when the tester provides no "input_ids"
+            input_ids = inputs_dict.get("input_features", None)
+
+        # iterate over all generative models and check the type of the output returned by `generate`
+        for model_class in self.all_generative_model_classes:
+            model = model_class(config)
+            output_beam_search = model.generate(
+                input_ids,
+                num_beams=2,
+                do_sample=False,  # beam search
+                output_scores=True,
+                output_hidden_states=True,
+                output_attentions=True,
+                return_dict_in_generate=True,
+            )
+            output_beam_sample = model.generate(
+                input_ids,
+                num_beams=2,
+                do_sample=True,  # beam sample
+                output_scores=True,
+                output_hidden_states=True,
+                output_attentions=True,
+                return_dict_in_generate=True,
+            )
 
-    @tf.function(
-        input_signature=[
-            {
-                "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
-            }
+            if model.config.is_encoder_decoder:
+                self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput)
+                self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput)
+            else:
+                self.assertIsInstance(output_beam_search, TFBeamSearchDecoderOnlyOutput)
+                self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
+
+    def test_loss_computation(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            if getattr(model, "hf_compute_loss", None):  # only models that define a loss are checked
+                # The number of elements in the loss should be the same as the number of elements in the label
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                added_label = prepared_for_class[
+                    sorted(
+                        list(prepared_for_class.keys() - inputs_dict.keys()),
+                        reverse=True,  # with several added keys, the alphabetically last one is used
+                    )[0]
+                ]
+                loss_size = tf.size(added_label)
+
+                if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING):
+                    # for the causal LM loss the labels are shifted, so one label per batch element
+                    # is cut
+                    loss_size = loss_size - self.model_tester.batch_size
+
+                # Test that the model correctly computes the loss with kwargs
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                possible_input_names = {
+                    "input_ids",
+                    "pixel_values",
+                    "input_features",
+                }
+                input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
+                model_input = prepared_for_class.pop(input_name)
+
+                loss = model(model_input, **prepared_for_class)[0]
+                self.assertEqual(loss.shape, [loss_size])
+
+                # Test that the model correctly computes the loss with a dict
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                loss = model(prepared_for_class)[0]
+                self.assertEqual(loss.shape, [loss_size])
+
+                # Test that the model correctly computes the loss with a tuple
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+
+                # Get keys that were added with the _prepare_for_class function
+                label_keys = prepared_for_class.keys() - inputs_dict.keys()
+                signature = inspect.signature(model.call).parameters
+                signature_names = list(signature.keys())
+
+                # Create a dictionary holding the location of the tensors in the tuple
+                tuple_index_mapping = {0: input_name}
+                for label_key in label_keys:
+                    label_key_index = signature_names.index(label_key)
+                    tuple_index_mapping[label_key_index] = label_key
+                sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
+                # Initialize a list with the default value of every `call` parameter except `kwargs`,
+                list_input = []
+
+                for name in signature_names:
+                    if name != "kwargs":
+                        list_input.append(signature[name].default)
+
+                for index, value in sorted_tuple_index_mapping:  # then overwrite the input and label slots
+                    list_input[index] = prepared_for_class[value]
+
+                tuple_input = tuple(list_input)
+
+                # Send to the model; the last positional entry of the tuple is dropped
+                loss = model(tuple_input[:-1])[0]
+
+                self.assertEqual(loss.shape, [loss_size])
+
+    def test_generate_with_headmasking(self):
+        """Checks that generating with an all-zero head mask yields all-zero attention weights."""
+        attention_names = [
+            "encoder_attentions",
+            "decoder_attentions",
+            "cross_attentions",
         ]
-    )
-    def serving(self, inputs):
-        """
-        Method used for serving the model.
-
-        Args:
-            inputs (`Dict[str, tf.Tensor]`):
-                The input of the saved model as a dictionary of tensors.
-        """
-        return self.call(inputs)
-
-
-CONVNEXT_START_DOCSTRING = r"""
-    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
-    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
-    behavior.
-
-    <Tip>
-
-    TF 2.0 models accepts two formats as inputs:
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_generative_model_classes:
+            model = model_class(config)
+
+            # We want to test only encoder-decoder models
+            if not config.is_encoder_decoder:
+                continue
+
+            head_masking = {
+                "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)),
+                "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
+                "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
+            }
 
-    - having all inputs as keyword arguments (like PyTorch models), or
-    - having all inputs as a list, tuple or dict in the first positional arguments.
+            signature = inspect.signature(model.call)
+            # NOTE(review): this skips the models whose `call` accepts all three masks, which looks inverted
+            # (`if not ... <` would skip the unsupported ones instead) - left unchanged, confirm before flipping.
+            if set(head_masking.keys()) < set([*signature.parameters.keys()]):
+                continue
+
+            for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
+                out = model.generate(
+                    inputs_dict["input_ids"],
+                    num_beams=1,
+                    max_length=inputs_dict["input_ids"].shape[-1] + 5,  # an integer: input length + 5
+                    output_attentions=True,
+                    return_dict_in_generate=True,
+                    **{name: mask},
+                )
+                # We check the state of decoder_attentions and cross_attentions just from the last step
+                attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
+                self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0)
+
+    def test_load_with_mismatched_shapes(self):
+        """
+        Checks that reloading a saved checkpoint with a different `num_labels`/`vocab_size` raises a `ValueError`
+        unless `ignore_mismatched_sizes=True` is passed, in which case a shape-mismatch message is logged.
+        """
+        if not self.test_mismatched_shapes:
+            return
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
+                continue
+
+            with self.subTest(msg=f"Testing {model_class}"), tempfile.TemporaryDirectory() as tmp_dir:
+                model = model_class(config)
+                inputs = self._prepare_for_class(inputs_dict, model_class)
+                _ = model(**inputs)
+                model.save_pretrained(tmp_dir)
+
+                # Fails when we don't set ignore_mismatched_sizes=True
+                with self.assertRaises(ValueError):
+                    TFAutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42)
+                with self.assertRaises(ValueError):
+                    TFAutoModel.from_pretrained(tmp_dir, vocab_size=10)
+
+                logger = logging.get_logger("transformers.modeling_tf_utils")
+                with CaptureLogger(logger) as classifier_logs:
+                    new_model = TFAutoModelForSequenceClassification.from_pretrained(
+                        tmp_dir, num_labels=42, ignore_mismatched_sizes=True
+                    )
+                self.assertIn("the shapes did not match", classifier_logs.out)
+
+                logits = new_model(**inputs).logits
+                self.assertEqual(logits.shape[1], 42)
+
+                with CaptureLogger(logger) as base_model_logs:
+                    new_model_without_prefix = TFAutoModel.from_pretrained(
+                        tmp_dir, vocab_size=10, ignore_mismatched_sizes=True
+                    )
+                self.assertIn("the shapes did not match", base_model_logs.out)
+
+                # Although Tf models always have a prefix pointing to `MainLayer`,
+                # we still add this "without prefix" test to keep a consistency between tf and pt tests.
+                input_ids = ids_tensor((2, 8), 10)
+                if self.is_encoder_decoder:
+                    new_model_without_prefix(input_ids, decoder_input_ids=input_ids)
+                else:
+                    new_model_without_prefix(input_ids)
+
+    def test_model_main_input_name(self):
+        """Checks that `main_input_name` of every model class names the first `call` argument after `self`."""
+        for model_class in self.all_model_classes:
+            parameter_names = list(inspect.signature(model_class.call).parameters)
+            # index 0 is `self`; the main input is the argument right after it
+            self.assertEqual(model_class.main_input_name, parameter_names[1])
+
+    def _generate_random_bad_tokens(self, num_bad_tokens, model):
+        """
+        Draws `num_bad_tokens` random token ids below `self.model_tester.vocab_size`, rejecting any id equal to the
+        model config's bos, pad or eos token id, and returns them as a list.
+        """
+        config = model.config
+        # special tokens cannot be bad tokens
+        candidate_specials = (config.bos_token_id, config.pad_token_id, config.eos_token_id)
+        special_tokens = [token_id for token_id in candidate_specials if token_id is not None]
+
+        bad_tokens = []
+        # keep drawing until enough non-special ids have been collected
+        while len(bad_tokens) < num_bad_tokens:
+            token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0]
+            if token not in special_tokens:
+                bad_tokens.append(token)
+        return bad_tokens
+
+    def _check_generated_ids(self, output_ids):
+        for token_id in output_ids[0].numpy().tolist():  # only the first sequence of the batch is inspected
+            self.assertGreaterEqual(token_id, 0)
+            self.assertLess(token_id, self.model_tester.vocab_size)  # ids must be valid vocabulary indices
+
+    def _check_match_tokens(self, generated_ids, bad_words_ids):
+        """Return True if any sequence in `bad_words_ids` occurs contiguously in any row of `generated_ids`."""
+        for bad_word_ids in bad_words_ids:
+            # for all slices in batch
+            for generated_ids_slice in generated_ids:
+                # `i` is the exclusive window end; `+ 1` makes the window ending at the last token reachable
+                for i in range(len(bad_word_ids), len(generated_ids_slice) + 1):
+                    # if tokens match
+                    if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids:
+                        return True
+        return False
+
+
+def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
+    """
+    Builds a tensor of the given `shape` filled with random integers drawn from `[0, vocab_size - 1]`.
+
+    `rng` defaults to a fresh `random.Random()` and `dtype` to `tf.int32`; `name` is accepted but not used.
+    """
+    generator = random.Random() if rng is None else rng
+    tensor_dtype = tf.int32 if dtype is None else dtype
+
+    num_elements = 1
+    for dim in shape:
+        num_elements *= dim
+
+    values = [generator.randint(0, vocab_size - 1) for _ in range(num_elements)]
+
+    return tf.constant(values, shape=shape, dtype=tensor_dtype)
+
+
+def random_attention_mask(shape, rng=None, name=None, dtype=None):
+    """
+    Creates a random 0/1 mask of the 2D `shape`, drawn via `ids_tensor` with the given `rng`, `name` and `dtype`.
+    """
+    attn_mask = ids_tensor(shape, vocab_size=2, rng=rng, name=name, dtype=dtype)
+    # make sure that at least one token is attended to for each batch
+    attn_mask = tf.concat(
+        [tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]],
+        axis=1,
+    )
+    return attn_mask
 
-    This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the
-    tensors in the first argument of the model call function: `model(inputs)`.
 
-    </Tip>
+def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
+    """
+    Creates a random tensor of the given `shape` whose entries are `rng.random() * scale`.
+
+    `rng` defaults to a fresh `random.Random()` and `dtype` to `tf.float32`; `name` is accepted but not used.
+    """
+    generator = random.Random() if rng is None else rng
 
-    Parameters:
-        config ([`ConvNextConfig`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
-"""
+    num_elements = 1
+    for dim in shape:
+        num_elements *= dim
 
-CONVNEXT_INPUTS_DOCSTRING = r"""
-    Args:
-        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`ConvNextFeatureExtractor`]. See
-            [`ConvNextFeatureExtractor.__call__`] for details.
+    # one draw per element, collected flat and reshaped at the end
+    values = [generator.random() * scale for _ in range(num_elements)]
 
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
-            used instead.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
-            in eager mode, in graph mode the value will always be set to True.
-"""
+    flat_tensor = tf.constant(values, dtype=tf.float32 if dtype is None else dtype)
+    return tf.reshape(flat_tensor, shape=shape)
 
 
-@add_start_docstrings(
-    "The bare ConvNext model outputting raw features without any specific head on top.",
-    CONVNEXT_START_DOCSTRING,
-)
-class TFConvNextModel(TFConvNextPreTrainedModel):
-    def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext")
-
-    @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
-    def call(
-        self,
-        pixel_values: Optional[TFModelInputType] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        training: bool = False,
-        **kwargs,
-    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
-        r"""
-        Returns:
-
-        Examples:
-
-        ```python
-        >>> from transformers import ConvNextFeatureExtractor, TFConvNextModel
-        >>> from PIL import Image
-        >>> import requests
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224")
-        >>> model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224")
-
-        >>> inputs = feature_extractor(images=image, return_tensors="tf")
-        >>> outputs = model(**inputs)
-        >>> last_hidden_states = outputs.last_hidden_state
-        ```"""
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=pixel_values,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            training=training,
-            kwargs_call=kwargs,
+@require_tf
+class UtilsFunctionsTest(unittest.TestCase):
+
+    # tests that tf_top_k_top_p_filtering leaves exactly the expected logits different from -inf
+    def test_top_k_top_p_filtering(self):
+        logits = tf.convert_to_tensor(
+            [
+                [
+                    8.2220991,  # 3rd highest value; idx. 0
+                    -0.5620044,
+                    5.23229752,
+                    4.0386393,
+                    -6.8798378,
+                    -0.54785802,
+                    -3.2012153,
+                    2.92777176,
+                    1.88171953,
+                    7.35341276,  # 5th highest value; idx. 9
+                    8.43207833,  # 2nd highest value; idx. 10
+                    -9.85711836,
+                    -5.96209236,
+                    -1.13039161,
+                    -7.1115294,
+                    -0.8369633,
+                    -5.3186408,
+                    7.06427407,
+                    0.81369344,
+                    -0.82023817,
+                    -5.9179796,
+                    0.58813443,
+                    -6.99778438,
+                    4.71551189,
+                    -0.18771637,
+                    7.44020759,  # 4th highest value; idx. 25
+                    9.38450987,  # 1st highest value; idx. 26
+                    2.12662941,
+                    -9.32562038,
+                    2.35652522,
+                ],  # cumulative prob of 5 highest values <= 0.6
+                [
+                    0.58425518,
+                    4.53139238,
+                    -5.57510464,
+                    -6.28030699,
+                    -7.19529503,
+                    -4.02122551,
+                    1.39337037,
+                    -6.06707057,
+                    1.59480517,
+                    -9.643119,
+                    0.03907799,
+                    0.67231762,
+                    -8.88206726,
+                    6.27115922,  # 4th highest value; idx. 13
+                    2.28520723,
+                    4.82767506,
+                    4.30421368,
+                    8.8275313,  # 2nd highest value; idx. 17
+                    5.44029958,  # 5th highest value; idx. 18
+                    -4.4735794,
+                    7.38579536,  # 3rd highest value; idx. 20
+                    -2.91051663,
+                    2.61946077,
+                    -2.5674762,
+                    -9.48959302,
+                    -4.02922645,
+                    -1.35416918,
+                    9.67702323,  # 1st highest value; idx. 27
+                    -5.89478553,
+                    1.85370467,
+                ],  # cumulative prob of 5 highest values <= 0.6
+            ],
+            dtype=tf.float32,
         )
 
-        if "input_ids" in inputs:
-            inputs["pixel_values"] = inputs.pop("input_ids")
-
-        if inputs["pixel_values"] is None:
-            raise ValueError("You have to specify pixel_values")
-
-        outputs = self.convnext(
-            pixel_values=inputs["pixel_values"],
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            training=inputs["training"],
-        )
+        non_inf_expected_idx = tf.convert_to_tensor(
+            [
+                [0, 0],
+                [0, 9],
+                [0, 10],
+                [0, 25],
+                [0, 26],
+                [1, 13],
+                [1, 17],
+                [1, 18],
+                [1, 20],
+                [1, 27],
+            ],
+            dtype=tf.int32,
+        )  # expected non filtered idx as noted above, as (row, column) pairs
+
+        non_inf_expected_output = tf.convert_to_tensor(
+            [
+                8.222099,
+                7.3534126,
+                8.432078,
+                7.4402075,
+                9.38451,
+                6.271159,
+                8.827531,
+                5.4402995,
+                7.3857956,
+                9.677023,
+            ],
+            dtype=tf.float32,
+        )  # expected non filtered values as noted above, in the same order as the indices
 
-        # converts back NHWC -> NCHW, to match PT's output
-        if not return_dict:
-            return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:]
+        output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4)
 
-        return TFBaseModelOutputWithPooling(
-            last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)),
-            pooler_output=outputs.pooler_output,
-            hidden_states=outputs.hidden_states,
+        non_inf_output = output[output != -float("inf")]  # values that were not filtered out
+        non_inf_idx = tf.cast(
+            tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))),
+            dtype=tf.int32,
         )
 
-
-@add_start_docstrings(
-    """
-    ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
-    ImageNet.
-    """,
-    CONVNEXT_START_DOCSTRING,
-)
-class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss):
-    def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-
-        self.num_labels = config.num_labels
-        self.convnext = TFConvNextMainLayer(config, name="convnext")
-
-        # Classifier head
-        self.classifier = tf.keras.layers.Dense(
-            units=config.num_labels,
-            kernel_initializer=get_initializer(config.initializer_range),
-            bias_initializer="zeros",
-            name="classifier",
+        tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12)  # kept values match
+        tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx)  # and sit at the expected positions
+
+
+@require_tf
+@is_staging_test
+class TFModelPushToHubTester(unittest.TestCase):
+    """Tests (marked `is_staging_test`) that push a small `TFBertModel` to the hub."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Logs in once and keeps the value returned by `login` for the tests and the cleanup."""
+        cls._token = login(username=USER, password=PASS)
+
+    @classmethod
+    def tearDownClass(cls):
+        """Deletes the repos the tests push to; an `HTTPError` during cleanup is ignored."""
+        try:
+            delete_repo(token=cls._token, name="test-model-tf")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(
+                token=cls._token,
+                name="test-model-tf-org",
+                organization="valid_org",
+            )
+        except HTTPError:
+            pass
+
+    def test_push_to_hub(self):
+        """Pushes via `save_pretrained(push_to_hub=True)` and checks the reloaded weights are identical."""
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
+        model = TFBertModel(config)
+        # Make sure model is properly initialized
+        _ = model(model.dummy_inputs)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(
+                os.path.join(tmp_dir, "test-model-tf"),
+                push_to_hub=True,
+                use_auth_token=self._token,
+            )
 
-    @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
-    def call(
-        self,
-        pixel_values: Optional[TFModelInputType] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
-        training: Optional[bool] = False,
-        **kwargs,
-    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
-        r"""
-        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
-            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-
-        Returns:
-
-        Examples:
-
-        ```python
-        >>> from transformers import ConvNextFeatureExtractor, TFConvNextForImageClassification
-        >>> import tensorflow as tf
-        >>> from PIL import Image
-        >>> import requests
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224")
-        >>> model = TFViTForImageClassification.from_pretrained("facebook/convnext-tiny-224")
-
-        >>> inputs = feature_extractor(images=image, return_tensors="tf")
-        >>> outputs = model(**inputs)
-        >>> logits = outputs.logits
-        >>> # model predicts one of the 1000 ImageNet classes
-        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
-        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
-        ```"""
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
+            # the reloaded weights must be exactly equal to the pushed ones
+            models_equal = True
+            for p1, p2 in zip(model.weights, new_model.weights):
+                if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                    models_equal = False
+            self.assertTrue(models_equal)
+
+    def test_push_to_hub_with_model_card(self):
+        """Checks that `push_to_hub` leaves a `README.md` model card in the local repo folder."""
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=pixel_values,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            labels=labels,
-            training=training,
-            kwargs_call=kwargs,
+        model = TFBertModel(config)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            repo_dir = os.path.join(tmp_dir, "test-model-tf")
+            model.push_to_hub(repo_dir)
+            # the model card is expected in the very folder handed to `push_to_hub`, so the path is built
+            # once and reused for both the push and the check
+            self.assertTrue(os.path.isfile(os.path.join(repo_dir, "README.md")))
+
+    def test_push_to_hub_in_organization(self):
+        """Same round trip as `test_push_to_hub`, but pushing to the `valid_org` organization."""
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
+        model = TFBertModel(config)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(
+                os.path.join(tmp_dir, "test-model-tf-org"),
+                push_to_hub=True,
+                use_auth_token=self._token,
+                organization="valid_org",
+            )
 
-        if "input_ids" in inputs:
-            inputs["pixel_values"] = inputs.pop("input_ids")
-
-        if inputs["pixel_values"] is None:
-            raise ValueError("You have to specify pixel_values")
-
-        outputs = self.convnext(
-            inputs["pixel_values"],
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            training=inputs["training"],
-        )
-
-        pooled_output = outputs.pooler_output if return_dict else outputs[1]
-
-        logits = self.classifier(pooled_output)
-        loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
-
-        if not inputs["return_dict"]:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
-        return TFSequenceClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-        )
+            new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org")
+            # the reloaded weights must be exactly equal to the pushed ones
+            models_equal = True
+            for p1, p2 in zip(model.weights, new_model.weights):
+                if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                    models_equal = False
+            self.assertTrue(models_equal)
diff --git a/tests/convnext/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py
index 233ec6662b820..6f8c142b654d8 100644
--- a/tests/convnext/test_modeling_tf_convnext.py
+++ b/tests/convnext/test_modeling_tf_convnext.py
@@ -16,6 +16,7 @@
 
 import inspect
 import unittest
+from typing import List, Tuple
 
 from transformers import ConvNextConfig
 from transformers.file_utils import cached_property, is_tf_available, is_vision_available
@@ -222,6 +223,50 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
+    # ConvNext has no attention outputs, so this overrides the common test and
+    # only exercises the `output_hidden_states` combinations.
+    def test_model_outputs_equivalence(self):
+        """Check that tuple (`return_dict=False`) and dict model outputs match.
+
+        Every model class is run with and without labels, and with and without
+        `output_hidden_states=True`.
+        """
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs=None):
+            # `None` default avoids sharing one mutable dict between calls.
+            additional_kwargs = {} if additional_kwargs is None else additional_kwargs
+            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
+            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
+
+            def recursive_check(tuple_object, dict_object):
+                # Walk nested outputs pairwise and compare the leaf tensors.
+                if isinstance(tuple_object, (List, Tuple)):
+                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
+                        recursive_check(tuple_iterable_value, dict_iterable_value)
+                elif tuple_object is None:
+                    return
+                else:
+                    # `reduce_all` collapses the element-wise comparison to one boolean;
+                    # Python's `all` would call `bool()` on each multi-element row instead.
+                    self.assertTrue(
+                        tf.reduce_all(tf.equal(tuple_object, dict_object)),
+                        msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}",
+                    )
+
+            # Must be called from `check_equivalence`: if this call is nested inside
+            # `recursive_check`, the comparison never runs and the test checks nothing.
+            recursive_check(tuple_output, dict_output)
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            # Cover every `output_hidden_states` / labels combination.
+            for additional_kwargs in ({}, {"output_hidden_states": True}):
+                for return_labels in (False, True):
+                    tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+                    dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+                    check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs)
+
     def test_for_image_classification(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index e072b4febd90b..2038f29e56cf8 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -83,7 +83,8 @@
             # Restrict TensorFlow to only allocate x GB of memory on the GPUs
             try:
                 tf.config.set_logical_device_configuration(
-                    gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)]
+                    gpu,
+                    [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)],
                 )
                 logical_gpus = tf.config.list_logical_devices("GPU")
                 print("Logical GPUs", logical_gpus)
@@ -116,7 +117,10 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d
 
         if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
             inputs_dict = {
-                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
+                k: tf.tile(
+                    tf.expand_dims(v, 1),
+                    (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1),
+                )
                 if isinstance(v, tf.Tensor) and v.ndim > 0
                 else v
                 for k, v in inputs_dict.items()
@@ -144,7 +148,11 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d
                 *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
             ]:
                 inputs_dict["labels"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
+                    (
+                        self.model_tester.batch_size,
+                        self.model_tester.seq_length,
+                    ),
+                    dtype=tf.int32,
                 )
         return inputs_dict
 
@@ -152,7 +160,10 @@ def test_initialization(self):
         pass
 
     def test_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -166,7 +177,10 @@ def test_save_load(self):
                 self.assert_outputs_same(after_outputs, outputs)
 
     def test_save_load_config(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -218,7 +232,10 @@ def test_onnx_compliancy(self):
         if not self.test_onnx:
             return
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         INTERNAL_OPS = [
             "Assert",
             "AssignVariableOp",
@@ -265,7 +282,10 @@ def test_onnx_runtime_optimize(self):
         import onnxruntime
         import tf2onnx
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -276,7 +296,10 @@ def test_onnx_runtime_optimize(self):
             onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
 
     def test_keras_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         tf_main_layer_classes = set(
             module_member
@@ -321,7 +344,8 @@ def test_keras_save_load(self):
                     )
                 else:
                     model = tf.keras.models.load_model(
-                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
+                        filepath,
+                        custom_objects={main_layer_class.__name__: main_layer_class},
                     )
                 assert isinstance(model, tf.keras.Model)
                 after_outputs = model(inputs_dict)
@@ -348,7 +372,10 @@ def test_pt_tf_model_equivalence(self):
 
         import transformers
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
@@ -361,7 +388,9 @@ def test_pt_tf_model_equivalence(self):
 
             # Check we can load pt model in tf and vice-versa with model => model functions
             tf_model = transformers.load_pytorch_model_in_tf2_model(
-                tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
+                tf_model,
+                pt_model,
+                tf_inputs=self._prepare_for_class(inputs_dict, model_class),
             )
             pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
 
@@ -382,7 +411,10 @@ def test_pt_tf_model_equivalence(self):
 
             with torch.no_grad():
                 pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)
+            tfo = tf_model(
+                self._prepare_for_class(inputs_dict, model_class),
+                training=False,
+            )
 
             tf_hidden_states = tfo[0].numpy()
             pt_hidden_states = pto[0].numpy()
@@ -441,14 +473,20 @@ def test_pt_tf_model_equivalence(self):
             self.assertLessEqual(max_diff, 4e-2)
 
     def test_compile_tf_model(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         max_input = getattr(self.model_tester, "max_position_embeddings", 512)
         optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
         loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
         metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
 
         for model_class in self.all_model_classes:
-            if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]:
+            if model_class.__name__ in [
+                "TFSpeech2TextModel",
+                "TFSpeech2TextForConditionalGeneration",
+            ]:
                 inputs = {
                     "decoder_input_ids": tf.keras.Input(
                         batch_shape=(2, max_input),
@@ -472,7 +510,11 @@ def test_compile_tf_model(self):
                         name="decoder_input_ids",
                         dtype="int32",
                     ),
-                    "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
+                    "input_ids": tf.keras.Input(
+                        batch_shape=(2, max_input),
+                        name="input_ids",
+                        dtype="int32",
+                    ),
                 }
             # `pixel_values` implies that the input is an image
             elif model_class.main_input_name == "pixel_values":
@@ -488,7 +530,11 @@ def test_compile_tf_model(self):
                 )
             elif model_class.__name__ in ["TFCLIPModel"]:
                 inputs = {
-                    "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"),
+                    "input_ids": tf.keras.Input(
+                        batch_shape=(3, max_input),
+                        name="input_ids",
+                        dtype="int32",
+                    ),
                     "pixel_values": tf.keras.Input(
                         batch_shape=(
                             3,
@@ -501,7 +547,11 @@ def test_compile_tf_model(self):
                     ),
                 }
             elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
+                inputs = tf.keras.Input(
+                    batch_shape=(4, 2, max_input),
+                    name="input_ids",
+                    dtype="int32",
+                )
             else:
                 inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
 
@@ -524,7 +574,10 @@ def test_compile_tf_model(self):
             extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 
     def test_keyword_and_dict_args(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -540,10 +593,21 @@ def test_keyword_and_dict_args(self):
             self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
 
     def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length)
+        decoder_seq_length = getattr(
+            self.model_tester,
+            "decoder_seq_length",
+            self.model_tester.seq_length,
+        )
+        encoder_seq_length = getattr(
+            self.model_tester,
+            "encoder_seq_length",
+            self.model_tester.seq_length,
+        )
         decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
         encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
 
@@ -554,7 +618,11 @@ def check_decoder_attentions_output(outputs):
             self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
+                [
+                    self.model_tester.num_attention_heads,
+                    decoder_seq_length,
+                    decoder_key_length,
+                ],
             )
 
         def check_encoder_attentions_output(outputs):
@@ -564,7 +632,11 @@ def check_encoder_attentions_output(outputs):
             self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+                [
+                    self.model_tester.num_attention_heads,
+                    encoder_seq_length,
+                    encoder_key_length,
+                ],
             )
 
         for model_class in self.all_model_classes:
@@ -606,7 +678,10 @@ def test_headmasking(self):
             return
 
         random.Random().seed(42)
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         random.Random().seed()
 
         inputs_dict["output_attentions"] = True
@@ -619,11 +694,19 @@ def test_headmasking(self):
             def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
                 if i == 0:
                     return tf.concat(
-                        (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0
+                        (
+                            tf.zeros(1, dtype=tf.float32),
+                            tf.ones(attention_heads - 1, dtype=tf.float32),
+                        ),
+                        0,
                     )
                 elif i == num_hidden_layers - 1:
                     return tf.concat(
-                        (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0
+                        (
+                            tf.zeros(attention_heads - 1, dtype=tf.float32),
+                            tf.ones(1, dtype=tf.float32),
+                        ),
+                        0,
                     )
                 else:
                     return tf.ones(attention_heads, dtype=tf.float32)
@@ -652,7 +735,8 @@ def check_attentions_validity(attentions):
                 # Remove Nan
                 for t in attentions:
                     self.assertLess(
-                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy()
+                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(),
+                        (tf.size(t) / 4).numpy(),
                     )  # Check we don't have more than 25% nans (arbitrary)
 
                 attentions = [
@@ -660,11 +744,23 @@ def check_attentions_validity(attentions):
                 ]  # remove them (the test is less complete)
 
                 self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
-                self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0)
+                self.assertNotEqual(
+                    tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(),
+                    0.0,
+                )
                 if len(attentions) > 2:  # encoder-decodere models have only 2 layers in each modules
-                    self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0)
-                self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0)
-                self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0)
+                    self.assertNotEqual(
+                        tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(),
+                        0.0,
+                    )
+                self.assertAlmostEqual(
+                    tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(),
+                    0.0,
+                )
+                self.assertNotEqual(
+                    tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(),
+                    0.0,
+                )
 
             if model.config.is_encoder_decoder:
                 check_attentions_validity(outputs.encoder_attentions)
@@ -675,13 +771,18 @@ def check_attentions_validity(attentions):
                 check_attentions_validity(outputs.attentions)
 
     def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_hidden_states_output(config, inputs_dict, model_class):
             model = model_class(config)
             outputs = model(self._prepare_for_class(inputs_dict, model_class))
             expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+                self.model_tester,
+                "expected_num_hidden_layers",
+                self.model_tester.num_hidden_layers + 1,
             )
 
             if model.config.is_encoder_decoder:
@@ -692,12 +793,18 @@ def check_hidden_states_output(config, inputs_dict, model_class):
                 self.assertEqual(len(encoder_hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(encoder_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
                 )
                 self.assertEqual(len(decoder_hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(decoder_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
                 )
             else:
                 hidden_states = outputs.hidden_states
@@ -705,7 +812,10 @@ def check_hidden_states_output(config, inputs_dict, model_class):
                 self.assertEqual(len(hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
                 )
 
         for model_class in self.all_model_classes:
@@ -717,7 +827,10 @@ def check_hidden_states_output(config, inputs_dict, model_class):
             check_hidden_states_output(config, inputs_dict, model_class)
 
     def test_model_common_attributes(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         text_in_text_out_models = (
             get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
             + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
@@ -747,13 +860,22 @@ def test_model_common_attributes(self):
                 assert name is None
 
     def test_determinism(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
             first, second = (
-                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
-                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
+                model(
+                    self._prepare_for_class(inputs_dict, model_class),
+                    training=False,
+                )[0],
+                model(
+                    self._prepare_for_class(inputs_dict, model_class),
+                    training=False,
+                )[0],
             )
             out_1 = first.numpy()
             out_2 = second.numpy()
@@ -764,7 +886,10 @@ def test_determinism(self):
 
     def test_model_outputs_equivalence(self):
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
             tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
@@ -799,30 +924,32 @@ def recursive_check(tuple_object, dict_object):
             dict_inputs = self._prepare_for_class(inputs_dict, model_class)
             check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
 
-            # Pure conv models (such as ConvNeXt) don't have `output_attentions`.
-            if config.output_attentions:
-                tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-                dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
 
             tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
 
-            if config.output_attentions:
-                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
 
-            if config.output_attentions:
-                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                check_equivalence(
-                    model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
-                )
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(
+                model,
+                tuple_inputs,
+                dict_inputs,
+                {"output_hidden_states": True, "output_attentions": True},
+            )
 
     def test_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -849,7 +976,10 @@ def test_inputs_embeds(self):
             model(inputs)
 
     def test_numpy_arrays_inputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def prepare_numpy_arrays(inputs_dict):
             inputs_np_dict = {}
@@ -874,7 +1004,10 @@ def prepare_numpy_arrays(inputs_dict):
     def test_resize_token_embeddings(self):
         if not self.test_resize_embeddings:
             return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def _get_word_embedding_weight(model, embedding_layer):
             embeds = getattr(embedding_layer, "weight", None)
@@ -933,16 +1066,25 @@ def _get_word_embedding_weight(model, embedding_layer):
 
                 if old_output_embeddings is not None and new_output_embeddings is not None:
                     self.assertEqual(new_output_embeddings.shape[0], assert_size)
-                    self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1])
+                    self.assertEqual(
+                        new_output_embeddings.shape[1],
+                        old_output_embeddings.shape[1],
+                    )
 
                     models_equal = True
-                    for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
+                    for p1, p2 in zip(
+                        old_output_embeddings.value(),
+                        new_output_embeddings.value(),
+                    ):
                         if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
                             models_equal = False
                     self.assertTrue(models_equal)
 
     def test_lm_head_model_random_no_beam_search_generate(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
 
         # iterate over all generative models
@@ -969,16 +1111,25 @@ def test_lm_head_model_random_no_beam_search_generate(self):
 
             # check bad words tokens language generation
             # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
+            bad_words_ids = [
+                self._generate_random_bad_tokens(1, model),
+                self._generate_random_bad_tokens(2, model),
+            ]
             output_tokens = model.generate(
-                input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2
+                input_ids,
+                do_sample=True,
+                bad_words_ids=bad_words_ids,
+                num_return_sequences=2,
             )
             # only count generated tokens
             generated_ids = output_tokens[:, input_ids.shape[-1] :]
             self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
     def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
         if input_ids is None:
             input_ids = inputs_dict.get("input_features", None)
@@ -1011,7 +1162,10 @@ def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
                 self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
 
     def test_lm_head_model_random_beam_search_generate(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
 
         for model_class in self.all_generative_model_classes:
@@ -1026,7 +1180,12 @@ def test_lm_head_model_random_beam_search_generate(self):
 
             with self.assertRaises(AssertionError):
                 # generating more sequences than having beams leads is not possible
-                model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
+                model.generate(
+                    input_ids,
+                    do_sample=False,
+                    num_return_sequences=3,
+                    num_beams=2,
+                )
 
             # num_return_sequences > 1, sample
             self._check_generated_ids(
@@ -1038,20 +1197,37 @@ def test_lm_head_model_random_beam_search_generate(self):
                 )
             )
             # num_return_sequences > 1, greedy
-            self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2))
+            self._check_generated_ids(
+                model.generate(
+                    input_ids,
+                    do_sample=False,
+                    num_beams=2,
+                    num_return_sequences=2,
+                )
+            )
 
             # check bad words tokens language generation
             # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
+            bad_words_ids = [
+                self._generate_random_bad_tokens(1, model),
+                self._generate_random_bad_tokens(2, model),
+            ]
             output_tokens = model.generate(
-                input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2
+                input_ids,
+                do_sample=False,
+                bad_words_ids=bad_words_ids,
+                num_beams=2,
+                num_return_sequences=2,
             )
             # only count generated tokens
             generated_ids = output_tokens[:, input_ids.shape[-1] :]
             self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
     def test_lm_head_model_beam_search_generate_dict_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
         if input_ids is None:
             input_ids = inputs_dict.get("input_features", None)
@@ -1086,14 +1262,20 @@ def test_lm_head_model_beam_search_generate_dict_outputs(self):
                 self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
 
     def test_loss_computation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
             model = model_class(config)
             if getattr(model, "hf_compute_loss", None):
                 # The number of elements in the loss should be the same as the number of elements in the label
                 prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
                 added_label = prepared_for_class[
-                    sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0]
+                    sorted(
+                        list(prepared_for_class.keys() - inputs_dict.keys()),
+                        reverse=True,
+                    )[0]
                 ]
                 loss_size = tf.size(added_label)
 
@@ -1104,7 +1286,11 @@ def test_loss_computation(self):
 
                 # Test that model correctly compute the loss with kwargs
                 prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                possible_input_names = {"input_ids", "pixel_values", "input_features"}
+                possible_input_names = {
+                    "input_ids",
+                    "pixel_values",
+                    "input_features",
+                }
                 input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
                 model_input = prepared_for_class.pop(input_name)
 
@@ -1148,8 +1334,15 @@ def test_loss_computation(self):
                 self.assertEqual(loss.shape, [loss_size])
 
     def test_generate_with_headmasking(self):
-        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        attention_names = [
+            "encoder_attentions",
+            "decoder_attentions",
+            "cross_attentions",
+        ]
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_generative_model_classes:
             model = model_class(config)
@@ -1184,7 +1377,10 @@ def test_generate_with_headmasking(self):
     def test_load_with_mismatched_shapes(self):
         if not self.test_mismatched_shapes:
             return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
@@ -1291,7 +1487,13 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
 def random_attention_mask(shape, rng=None, name=None, dtype=None):
     attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype)
     # make sure that at least one token is attended to for each batch
-    attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1)
+    attn_mask = tf.concat(
+        [
+            tf.constant(value=1, shape=(shape[0], 1), dtype=dtype),
+            attn_mask[:, 1:],
+        ],
+        axis=1,
+    )
     return attn_mask
 
 
@@ -1308,7 +1510,10 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
     for _ in range(total_dims):
         values.append(rng.random() * scale)
 
-    return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape)
+    return tf.reshape(
+        tf.constant(values, dtype=dtype if dtype is not None else tf.float32),
+        shape=shape,
+    )
 
 
 @require_tf
@@ -1387,12 +1592,34 @@ def test_top_k_top_p_filtering(self):
         )
 
         non_inf_expected_idx = tf.convert_to_tensor(
-            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]],
+            [
+                [0, 0],
+                [0, 9],
+                [0, 10],
+                [0, 25],
+                [0, 26],
+                [1, 13],
+                [1, 17],
+                [1, 18],
+                [1, 20],
+                [1, 27],
+            ],
             dtype=tf.int32,
         )  # expected non filtered idx as noted above
 
         non_inf_expected_output = tf.convert_to_tensor(
-            [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023],
+            [
+                8.222099,
+                7.3534126,
+                8.432078,
+                7.4402075,
+                9.38451,
+                6.271159,
+                8.827531,
+                5.4402995,
+                7.3857956,
+                9.677023,
+            ],
             dtype=tf.float32,
         )  # expected non filtered values as noted above
 
@@ -1423,19 +1650,31 @@ def tearDownClass(cls):
             pass
 
         try:
-            delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org")
+            delete_repo(
+                token=cls._token,
+                name="test-model-tf-org",
+                organization="valid_org",
+            )
         except HTTPError:
             pass
 
     def test_push_to_hub(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
         model = TFBertModel(config)
         # Make sure model is properly initialized
         _ = model(model.dummy_inputs)
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(os.path.join(tmp_dir, "test-model-tf"), push_to_hub=True, use_auth_token=self._token)
+            model.save_pretrained(
+                os.path.join(tmp_dir, "test-model-tf"),
+                push_to_hub=True,
+                use_auth_token=self._token,
+            )
 
             new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
             models_equal = True
@@ -1446,7 +1685,11 @@ def test_push_to_hub(self):
 
     def test_push_to_hub_with_model_card(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
         model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -1455,7 +1698,11 @@ def test_push_to_hub_with_model_card(self):
 
     def test_push_to_hub_in_organization(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
         model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:

From 98111f8500b16b258191775e493cb6dd8ce5e37f Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 24 Feb 2022 13:34:05 +0530
Subject: [PATCH 51/65] fix: convnext tests.

---
 .../models/convnext/modeling_tf_convnext.py   | 2205 +++++------------
 tests/convnext/test_modeling_tf_convnext.py   |    4 +-
 2 files changed, 554 insertions(+), 1655 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 2038f29e56cf8..328194dddbc2c 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2019 HuggingFace Inc.
+# Copyright 2022 Meta Platforms Inc. and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,1710 +12,609 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+""" TF 2.0 ConvNext model."""
 
 
-import copy
-import inspect
-import json
-import os
-import random
-import tempfile
-import unittest
-from importlib import import_module
-from typing import List, Tuple
-
-from huggingface_hub import delete_repo, login
-from requests.exceptions import HTTPError
-from transformers import is_tf_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import tooslow  # noqa: F401
-from transformers.testing_utils import (
-    PASS,
-    USER,
-    CaptureLogger,
-    _tf_gpu_memory_limit,
-    is_pt_tf_cross_test,
-    is_staging_test,
-    require_tf,
-    require_tf2onnx,
-    slow,
+from typing import Dict, Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
+from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput
+from ...modeling_tf_utils import (
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    get_initializer,
+    input_processing,
+    keras_serializable,
 )
-from transformers.utils import logging
-
-
-if is_tf_available():
-    import numpy as np
-    import tensorflow as tf
-
-    from transformers import (
-        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
-        TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
-        TF_MODEL_FOR_MASKED_LM_MAPPING,
-        TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
-        TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
-        TF_MODEL_FOR_PRETRAINING_MAPPING,
-        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-        TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
-        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-        BertConfig,
-        TFAutoModel,
-        TFAutoModelForSequenceClassification,
-        TFBertModel,
-        TFSharedEmbeddings,
-        tf_top_k_top_p_filtering,
-    )
-    from transformers.generation_tf_utils import (
-        TFBeamSampleDecoderOnlyOutput,
-        TFBeamSampleEncoderDecoderOutput,
-        TFBeamSearchDecoderOnlyOutput,
-        TFBeamSearchEncoderDecoderOutput,
-        TFGreedySearchDecoderOnlyOutput,
-        TFGreedySearchEncoderDecoderOutput,
-        TFSampleDecoderOnlyOutput,
-        TFSampleEncoderDecoderOutput,
-    )
+from ...utils import logging
+from .configuration_convnext import ConvNextConfig
 
-    if _tf_gpu_memory_limit is not None:
-        gpus = tf.config.list_physical_devices("GPU")
-        for gpu in gpus:
-            # Restrict TensorFlow to only allocate x GB of memory on the GPUs
-            try:
-                tf.config.set_logical_device_configuration(
-                    gpu,
-                    [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)],
-                )
-                logical_gpus = tf.config.list_logical_devices("GPU")
-                print("Logical GPUs", logical_gpus)
-            except RuntimeError as e:
-                # Virtual devices must be set before GPUs have been initialized
-                print(e)
-
-
-def _config_zero_init(config):
-    configs_no_init = copy.deepcopy(config)
-    for key in configs_no_init.__dict__.keys():
-        if "_range" in key or "_std" in key:
-            setattr(configs_no_init, key, 0.0)
-    return configs_no_init
-
-
-@require_tf
-class TFModelTesterMixin:
-
-    model_tester = None
-    all_model_classes = ()
-    all_generative_model_classes = ()
-    test_mismatched_shapes = True
-    test_resize_embeddings = True
-    test_head_masking = True
-    is_encoder_decoder = False
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
-        inputs_dict = copy.deepcopy(inputs_dict)
-
-        if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-            inputs_dict = {
-                k: tf.tile(
-                    tf.expand_dims(v, 1),
-                    (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1),
-                )
-                if isinstance(v, tf.Tensor) and v.ndim > 0
-                else v
-                for k, v in inputs_dict.items()
-            }
 
-        if return_labels:
-            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
-                inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-                inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in [
-                *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
-                *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
-            ]:
-                inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
-                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in [
-                *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
-                *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
-                *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
-            ]:
-                inputs_dict["labels"] = tf.zeros(
-                    (
-                        self.model_tester.batch_size,
-                        self.model_tester.seq_length,
-                    ),
-                    dtype=tf.int32,
-                )
-        return inputs_dict
-
-    def test_initialization(self):
-        pass
-
-    def test_save_load(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=False)
-                model = model_class.from_pretrained(tmpdirname)
-                after_outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-                self.assert_outputs_same(after_outputs, outputs)
-
-    def test_save_load_config(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            model_config = model.get_config()
-            # make sure that returned config is jsonifiable, which is required by keras
-            json.dumps(model_config)
-            new_model = model_class.from_config(model.get_config())
-            # make sure it also accepts a normal config
-            _ = model_class.from_config(model.config)
-            _ = new_model(self._prepare_for_class(inputs_dict, model_class))  # Build model
-            new_model.set_weights(model.get_weights())
-            after_outputs = new_model(self._prepare_for_class(inputs_dict, model_class))
-
-            self.assert_outputs_same(after_outputs, outputs)
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            if model.config.is_encoder_decoder:
-                expected_arg_names = [
-                    "input_ids",
-                    "attention_mask",
-                    "decoder_input_ids",
-                    "decoder_attention_mask",
-                ]
-                expected_arg_names.extend(
-                    ["head_mask", "decoder_head_mask"] if "head_mask" and "decoder_head_mask" in arg_names else []
-                )
-                # Necessary to handle BART with newly added cross_attn_head_mask
-                expected_arg_names.extend(
-                    ["cross_attn_head_mask", "encoder_outputs"]
-                    if "cross_attn_head_mask" in arg_names
-                    else ["encoder_outputs"]
-                )
-                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
-
-            else:
-                expected_arg_names = ["input_ids"]
-                self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_onnx_compliancy(self):
-        if not self.test_onnx:
-            return
-
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        INTERNAL_OPS = [
-            "Assert",
-            "AssignVariableOp",
-            "EmptyTensorList",
-            "ReadVariableOp",
-            "ResourceGather",
-            "TruncatedNormal",
-            "VarHandleOp",
-            "VarIsInitializedOp",
-        ]
-        onnx_ops = []
+logger = logging.get_logger(__name__)
+
 
-        with open(os.path.join(".", "utils", "tf_ops", "onnx.json")) as f:
-            onnx_opsets = json.load(f)["opsets"]
+_CONFIG_FOR_DOC = "ConvNextConfig"
+_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"
 
-        for i in range(1, self.onnx_min_opset + 1):
-            onnx_ops.extend(onnx_opsets[str(i)])
 
-        for model_class in self.all_model_classes:
-            model_op_names = set()
+class TFConvNextDropPath(tf.keras.layers.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    References:
+        (1) github.com:rwightman/pytorch-image-models
+    """
 
-            with tf.Graph().as_default() as g:
-                model = model_class(config)
-                model(model.dummy_inputs)
+    def __init__(self, drop_path, **kwargs):
+        super().__init__(**kwargs)
+        self.drop_path = drop_path
 
-                for op in g.get_operations():
-                    model_op_names.add(op.node_def.op)
+    def call(self, x, training=None):
+        if training:
+            keep_prob = 1 - self.drop_path
+            shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
+            random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
+            random_tensor = tf.floor(random_tensor)
+            return (x / keep_prob) * random_tensor
+        return x
 
-            model_op_names = sorted(model_op_names)
-            incompatible_ops = []
 
-            for op in model_op_names:
-                if op not in onnx_ops and op not in INTERNAL_OPS:
-                    incompatible_ops.append(op)
+class TFConvNextEmbeddings(tf.keras.layers.Layer):
+    """This class is comparable to (and inspired by) the SwinEmbeddings class
+    found in src/transformers/models/swin/modeling_swin.py.
+    """
 
-            self.assertEqual(len(incompatible_ops), 0, incompatible_ops)
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.patch_embeddings = tf.keras.layers.Conv2D(
+            filters=config.hidden_sizes[0],
+            kernel_size=config.patch_size,
+            strides=config.patch_size,
+            name="patch_embeddings",
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer="zeros",
+        )
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
+
+    def call(self, pixel_values):
+        if isinstance(pixel_values, dict):
+            pixel_values = pixel_values["pixel_values"]
+
+        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
+        # So change the input format from `NCHW` to `NHWC`.
+        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+
+        embeddings = self.patch_embeddings(pixel_values)
+        embeddings = self.layernorm(embeddings)
+        return embeddings
+
+
+class TFConvNextLayer(tf.keras.layers.Layer):
+    """This corresponds to the `Block` class in the original implementation.
+
+    There are two equivalent implementations: (1) [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in
+    (N, C, H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back
+
+    The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to follow
+    NHWC ordering, we can just apply the operations straight-away without the permutation.
+
+    Args:
+        config ([`ConvNextConfig`]): Model configuration class.
+        dim (`int`): Number of input channels.
+        drop_path (`float`): Stochastic depth rate. Default: 0.0.
+    """
+
+    def __init__(self, config, dim, drop_path=0.0, **kwargs):
+        super().__init__(**kwargs)
+        self.dim = dim
+        self.config = config
+        self.dwconv = tf.keras.layers.Conv2D(
+            filters=dim,
+            kernel_size=7,
+            padding="same",
+            groups=dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer="zeros",
+            name="dwconv",
+        )  # depthwise conv
+        self.layernorm = tf.keras.layers.LayerNormalization(
+            epsilon=1e-6,
+            name="layernorm",
+        )
+        self.pwconv1 = tf.keras.layers.Dense(
+            units=4 * dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer="zeros",
+            name="pwconv1",
+        )  # pointwise/1x1 convs, implemented with linear layers
+        self.act = get_tf_activation(config.hidden_act)
+        self.pwconv2 = tf.keras.layers.Dense(
+            units=dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer="zeros",
+            name="pwconv2",
+        )
+        # Using `layers.Activation` instead of `tf.identity` to better control `training`
+        # behaviour.
+        self.drop_path = (
+            TFConvNextDropPath(
+                drop_path,
+                name="drop_path",
+            )
+            if drop_path > 0.0
+            else tf.keras.layers.Activation(
+                "linear",
+                name="drop_path",
+            )
+        )
 
-    @require_tf2onnx
-    @slow
-    def test_onnx_runtime_optimize(self):
-        if not self.test_onnx:
-            return
+    def build(self, input_shape: tf.TensorShape):
+        # PT's `nn.Parameters` must be mapped to a TF layer weight to inherit the same name hierarchy (and vice-versa)
+        self.layer_scale_parameter = (
+            self.add_weight(
+                shape=(self.dim,),
+                initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
+                trainable=True,
+                name="layer_scale_parameter",
+            )
+            if self.config.layer_scale_init_value > 0
+            else None
+        )
+        super().build(input_shape)
+
+    def call(self, hidden_states, training=False):
+        input = hidden_states
+        x = self.dwconv(hidden_states)
+        x = self.layernorm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+
+        if self.layer_scale_parameter is not None:
+            x = self.layer_scale_parameter * x
+
+        x = input + self.drop_path(x, training=training)
+        return x
+
+
+class TFConvNextStage(tf.keras.layers.Layer):
+    """ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks.
+
+    Args:
+        config ([`ConvNextConfig`]): Model configuration class.
+        in_channels (`int`): Number of input channels.
+        out_channels (`int`): Number of output channels.
+        depth (`int`): Number of residual blocks.
+        drop_path_rates (`List[float]`): Stochastic depth rates for each layer.
+    """
+
+    def __init__(
+        self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs
+    ):
+        super().__init__(**kwargs)
+        if in_channels != out_channels or stride > 1:
+            self.downsampling_layer = [
+                tf.keras.layers.LayerNormalization(
+                    epsilon=1e-6,
+                    name="downsampling_layer.0",
+                ),
+                # Inputs to this layer will follow NHWC format since we
+                # transposed the inputs from NCHW to NHWC in the `TFConvNextEmbeddings`
+                # layer. All the outputs throughout the model will be in NHWC
+                # from this point on until the output where we again change to
+                # NCHW.
+                tf.keras.layers.Conv2D(
+                    filters=out_channels,
+                    kernel_size=kernel_size,
+                    strides=stride,
+                    kernel_initializer=get_initializer(config.initializer_range),
+                    bias_initializer="zeros",
+                    name="downsampling_layer.1",
+                ),
+            ]
+        else:
+            self.downsampling_layer = [tf.identity]
+
+        drop_path_rates = drop_path_rates or [0.0] * depth
+        self.layers = [
+            TFConvNextLayer(
+                config,
+                dim=out_channels,
+                drop_path=drop_path_rates[j],
+                name=f"layers.{j}",
+            )
+            for j in range(depth)
+        ]
 
-        import onnxruntime
-        import tf2onnx
+    def call(self, hidden_states):
+        for layer in self.downsampling_layer:
+            hidden_states = layer(hidden_states)
+        for layer in self.layers:
+            hidden_states = layer(hidden_states)
+        return hidden_states
+
+
+class TFConvNextEncoder(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.stages = []
+        drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
+        cur = 0  # index into drop_path_rates of the first block of the current stage
+        prev_chs = config.hidden_sizes[0]
+        for i in range(config.num_stages):
+            out_chs = config.hidden_sizes[i]
+            stage = TFConvNextStage(
+                config,
+                in_channels=prev_chs,
+                out_channels=out_chs,
+                stride=2 if i > 0 else 1,
+                depth=config.depths[i],
+                drop_path_rates=drop_path_rates[cur : cur + config.depths[i]],  # one rate per block of stage i
+                name=f"stages.{i}",
+            )
+            self.stages.append(stage)
+            cur += config.depths[i]
+            prev_chs = out_chs
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+    def call(self, hidden_states, output_hidden_states=False, return_dict=True):
+        all_hidden_states = () if output_hidden_states else None
 
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            model(model.dummy_inputs)
+        for i, layer_module in enumerate(self.stages):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
 
-            onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset)
+            hidden_states = layer_module(hidden_states)
 
-            onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
 
-    def test_keras_save_load(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
 
-        tf_main_layer_classes = set(
-            module_member
-            for model_class in self.all_model_classes
-            for module in (import_module(model_class.__module__),)
-            for module_member_name in dir(module)
-            if module_member_name.endswith("MainLayer")
-            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
-            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
-            for module_member in (getattr(module, module_member_name),)
-            if isinstance(module_member, type)
-            and tf.keras.layers.Layer in module_member.__bases__
-            and getattr(module_member, "_keras_serializable", False)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
         )
-        for main_layer_class in tf_main_layer_classes:
-            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
-            if "T5" in main_layer_class.__name__:
-                # Take the same values than in TFT5ModelTester for this shared layer
-                shared = TFSharedEmbeddings(99, 32, name="shared")
-                config.use_cache = inputs_dict.pop("use_cache", None)
-                main_layer = main_layer_class(config, embed_tokens=shared)
-            else:
-                main_layer = main_layer_class(config)
-
-            symbolic_inputs = {
-                name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
-            }
 
-            model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
-            outputs = model(inputs_dict)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                filepath = os.path.join(tmpdirname, "keras_model.h5")
-                model.save(filepath)
-                if "T5" in main_layer_class.__name__:
-                    model = tf.keras.models.load_model(
-                        filepath,
-                        custom_objects={
-                            main_layer_class.__name__: main_layer_class,
-                            "TFSharedEmbeddings": TFSharedEmbeddings,
-                        },
-                    )
-                else:
-                    model = tf.keras.models.load_model(
-                        filepath,
-                        custom_objects={main_layer_class.__name__: main_layer_class},
-                    )
-                assert isinstance(model, tf.keras.Model)
-                after_outputs = model(inputs_dict)
-                self.assert_outputs_same(after_outputs, outputs)
-
-    def assert_outputs_same(self, after_outputs, outputs):
-        # Make sure we don't have nans
-        if isinstance(after_outputs, tf.Tensor):
-            out_1 = after_outputs.numpy()
-        elif isinstance(after_outputs, dict):
-            out_1 = after_outputs[list(after_outputs.keys())[0]].numpy()
-        else:
-            out_1 = after_outputs[0].numpy()
-        out_2 = outputs[0].numpy()
-        self.assertEqual(out_1.shape, out_2.shape)
-        out_1 = out_1[~np.isnan(out_1)]
-        out_2 = out_2[~np.isnan(out_2)]
-        max_diff = np.amax(np.abs(out_1 - out_2))
-        self.assertLessEqual(max_diff, 1e-5)
-
-    @is_pt_tf_cross_test
-    def test_pt_tf_model_equivalence(self):
-        import torch
-
-        import transformers
-
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
-            pt_model_class = getattr(transformers, pt_model_class_name)
-
-            config.output_hidden_states = True
-
-            tf_model = model_class(config)
-            pt_model = pt_model_class(config)
-
-            # Check we can load pt model in tf and vice-versa with model => model functions
-            tf_model = transformers.load_pytorch_model_in_tf2_model(
-                tf_model,
-                pt_model,
-                tf_inputs=self._prepare_for_class(inputs_dict, model_class),
-            )
-            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
-
-            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-            pt_model.eval()
-            pt_inputs_dict = {}
-            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
-                if type(key) == bool:
-                    pt_inputs_dict[name] = key
-                elif name == "input_values":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                elif name == "pixel_values":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                elif name == "input_features":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                else:
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
-
-            with torch.no_grad():
-                pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(
-                self._prepare_for_class(inputs_dict, model_class),
-                training=False,
-            )
 
-            tf_hidden_states = tfo[0].numpy()
-            pt_hidden_states = pto[0].numpy()
-
-            tf_nans = np.copy(np.isnan(tf_hidden_states))
-            pt_nans = np.copy(np.isnan(pt_hidden_states))
-
-            pt_hidden_states[tf_nans] = 0
-            tf_hidden_states[tf_nans] = 0
-            pt_hidden_states[pt_nans] = 0
-            tf_hidden_states[pt_nans] = 0
-
-            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
-            self.assertLessEqual(max_diff, 4e-2)
-
-            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
-                torch.save(pt_model.state_dict(), pt_checkpoint_path)
-                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
-
-                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
-                tf_model.save_weights(tf_checkpoint_path)
-                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
-
-            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-            pt_model.eval()
-            pt_inputs_dict = {}
-            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
-                if type(key) == bool:
-                    key = np.array(key, dtype=bool)
-                    pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long)
-                elif name == "input_values":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                elif name == "pixel_values":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                elif name == "input_features":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                else:
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
-
-            with torch.no_grad():
-                pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))
-            tfo = tfo[0].numpy()
-            pto = pto[0].numpy()
-            tf_nans = np.copy(np.isnan(tfo))
-            pt_nans = np.copy(np.isnan(pto))
-
-            pto[tf_nans] = 0
-            tfo[tf_nans] = 0
-            pto[pt_nans] = 0
-            tfo[pt_nans] = 0
-
-            max_diff = np.amax(np.abs(tfo - pto))
-            self.assertLessEqual(max_diff, 4e-2)
-
-    def test_compile_tf_model(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        max_input = getattr(self.model_tester, "max_position_embeddings", 512)
-        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
-        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-        metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
-
-        for model_class in self.all_model_classes:
-            if model_class.__name__ in [
-                "TFSpeech2TextModel",
-                "TFSpeech2TextForConditionalGeneration",
-            ]:
-                inputs = {
-                    "decoder_input_ids": tf.keras.Input(
-                        batch_shape=(2, max_input),
-                        name="decoder_input_ids",
-                        dtype="int32",
-                    ),
-                    "input_features": tf.keras.Input(
-                        batch_shape=(
-                            2,
-                            max_input,
-                            self.model_tester.input_feat_per_channel * self.model_tester.input_channels,
-                        ),
-                        name="input_features",
-                        dtype="float32",
-                    ),
-                }
-            elif self.is_encoder_decoder:
-                inputs = {
-                    "decoder_input_ids": tf.keras.Input(
-                        batch_shape=(2, max_input),
-                        name="decoder_input_ids",
-                        dtype="int32",
-                    ),
-                    "input_ids": tf.keras.Input(
-                        batch_shape=(2, max_input),
-                        name="input_ids",
-                        dtype="int32",
-                    ),
-                }
-            # `pixel_values` implies that the input is an image
-            elif model_class.main_input_name == "pixel_values":
-                inputs = tf.keras.Input(
-                    batch_shape=(
-                        3,
-                        self.model_tester.num_channels,
-                        self.model_tester.image_size,
-                        self.model_tester.image_size,
-                    ),
-                    name="pixel_values",
-                    dtype="float32",
-                )
-            elif model_class.__name__ in ["TFCLIPModel"]:
-                inputs = {
-                    "input_ids": tf.keras.Input(
-                        batch_shape=(3, max_input),
-                        name="input_ids",
-                        dtype="int32",
-                    ),
-                    "pixel_values": tf.keras.Input(
-                        batch_shape=(
-                            3,
-                            self.model_tester.vision_model_tester.num_channels,
-                            self.model_tester.vision_model_tester.image_size,
-                            self.model_tester.vision_model_tester.image_size,
-                        ),
-                        name="pixel_values",
-                        dtype="float32",
-                    ),
-                }
-            elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs = tf.keras.Input(
-                    batch_shape=(4, 2, max_input),
-                    name="input_ids",
-                    dtype="int32",
-                )
-            else:
-                inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
-
-            # Prepare our model
-            model = model_class(config)
-            model(self._prepare_for_class(inputs_dict, model_class))  # Model must be called before saving.
-            # Let's load it from the disk to be sure we can use pretrained weights
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=False)
-                model = model_class.from_pretrained(tmpdirname)
-
-            outputs_dict = model(inputs)
-            hidden_states = outputs_dict[0]
-
-            # Add a dense layer on top to test integration with other keras modules
-            outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
-
-            # Compile extended model
-            extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs])
-            extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
-
-    def test_keyword_and_dict_args(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-
-            outputs_dict = model(inputs)
-
-            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-            outputs_keywords = model(**inputs_keywords)
-            output_dict = outputs_dict[0].numpy()
-            output_keywords = outputs_keywords[0].numpy()
-
-            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
-
-    def test_attention_outputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-        decoder_seq_length = getattr(
-            self.model_tester,
-            "decoder_seq_length",
-            self.model_tester.seq_length,
+@keras_serializable
+class TFConvNextMainLayer(tf.keras.layers.Layer):
+    config_class = ConvNextConfig
+
+    def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
+        self.encoder = TFConvNextEncoder(config, name="encoder")
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+        self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None
+
+    def call(
+        self,
+        pixel_values: Optional[TFModelInputType] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        encoder_seq_length = getattr(
-            self.model_tester,
-            "encoder_seq_length",
-            self.model_tester.seq_length,
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
         )
-        decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-
-        def check_decoder_attentions_output(outputs):
-            out_len = len(outputs)
-            self.assertEqual(min(out_len % 2, out_len % 5), 0)  # differentiation due to newly added cross_attentions
-            decoder_attentions = outputs.decoder_attentions
-            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(decoder_attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    decoder_seq_length,
-                    decoder_key_length,
-                ],
-            )
 
-        def check_encoder_attentions_output(outputs):
-            attentions = [
-                t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions)
-            ]
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    encoder_seq_length,
-                    encoder_key_length,
-                ],
-            )
+        if "input_ids" in inputs:
+            inputs["pixel_values"] = inputs.pop("input_ids")
 
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["use_cache"] = False
-            config.output_hidden_states = False
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            out_len = len(outputs)
-            self.assertEqual(config.output_hidden_states, False)
-            check_encoder_attentions_output(outputs)
-
-            if self.is_encoder_decoder:
-                model = model_class(config)
-                outputs = model(self._prepare_for_class(inputs_dict, model_class))
-                self.assertEqual(config.output_hidden_states, False)
-                check_decoder_attentions_output(outputs)
-
-            # Check that output attentions can also be changed via the config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            self.assertEqual(config.output_hidden_states, False)
-            check_encoder_attentions_output(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = True
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
-            self.assertEqual(model.config.output_hidden_states, True)
-            check_encoder_attentions_output(outputs)
-
-    def test_headmasking(self):
-        if not self.test_head_masking:
-            return
-
-        random.Random().seed(42)
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        random.Random().seed()
-
-        inputs_dict["output_attentions"] = True
-        config.output_hidden_states = True
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-
-            # Prepare head_mask
-            def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
-                if i == 0:
-                    return tf.concat(
-                        (
-                            tf.zeros(1, dtype=tf.float32),
-                            tf.ones(attention_heads - 1, dtype=tf.float32),
-                        ),
-                        0,
-                    )
-                elif i == num_hidden_layers - 1:
-                    return tf.concat(
-                        (
-                            tf.zeros(attention_heads - 1, dtype=tf.float32),
-                            tf.ones(1, dtype=tf.float32),
-                        ),
-                        0,
-                    )
-                else:
-                    return tf.ones(attention_heads, dtype=tf.float32)
-
-            head_mask = tf.stack(
-                [
-                    prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers)
-                    for i in range(config.num_hidden_layers)
-                ],
-                0,
-            )
+        if inputs["pixel_values"] is None:
+            raise ValueError("You have to specify pixel_values")
 
-            inputs = self._prepare_for_class(inputs_dict, model_class).copy()
-            inputs["head_mask"] = head_mask
-            if model.config.is_encoder_decoder:
-                signature = inspect.signature(model.call)
-                arg_names = [*signature.parameters.keys()]
-                if "decoder_head_mask" in arg_names:  # necessary diferentiation because of T5 model
-                    inputs["decoder_head_mask"] = head_mask
-                if "cross_attn_head_mask" in arg_names:
-                    inputs["cross_attn_head_mask"] = head_mask
-
-            outputs = model(**inputs, return_dict=True)
-
-            def check_attentions_validity(attentions):
-                # Remove Nan
-                for t in attentions:
-                    self.assertLess(
-                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(),
-                        (tf.size(t) / 4).numpy(),
-                    )  # Check we don't have more than 25% nans (arbitrary)
-
-                attentions = [
-                    tf.where(tf.math.is_nan(t), 0.0, t) for t in attentions
-                ]  # remove them (the test is less complete)
-
-                self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
-                self.assertNotEqual(
-                    tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(),
-                    0.0,
-                )
-                if len(attentions) > 2:  # encoder-decodere models have only 2 layers in each modules
-                    self.assertNotEqual(
-                        tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(),
-                        0.0,
-                    )
-                self.assertAlmostEqual(
-                    tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(),
-                    0.0,
-                )
-                self.assertNotEqual(
-                    tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(),
-                    0.0,
-                )
-
-            if model.config.is_encoder_decoder:
-                check_attentions_validity(outputs.encoder_attentions)
-                check_attentions_validity(outputs.decoder_attentions)
-                if "cross_attn_head_mask" in arg_names:
-                    check_attentions_validity(outputs.cross_attentions)
-            else:
-                check_attentions_validity(outputs.attentions)
-
-    def test_hidden_states_output(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_hidden_states_output(config, inputs_dict, model_class):
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            expected_num_layers = getattr(
-                self.model_tester,
-                "expected_num_hidden_layers",
-                self.model_tester.num_hidden_layers + 1,
-            )
+        embedding_output = self.embeddings(inputs["pixel_values"], training=inputs["training"])
 
-            if model.config.is_encoder_decoder:
-                encoder_hidden_states = outputs.encoder_hidden_states
-                decoder_hidden_states = outputs.decoder_hidden_states
-
-                self.assertEqual(config.output_attentions, False)
-                self.assertEqual(len(encoder_hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(encoder_hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
-                )
-                self.assertEqual(len(decoder_hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(decoder_hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
-                )
-            else:
-                hidden_states = outputs.hidden_states
-                self.assertEqual(config.output_attentions, False)
-                self.assertEqual(len(hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
-                )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-    def test_model_common_attributes(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        text_in_text_out_models = (
-            get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
-            + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
-            + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING)
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=inputs["training"],
         )
-        speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
-            if model_class in text_in_text_out_models:
-                x = model.get_output_embeddings()
-                assert isinstance(x, tf.keras.layers.Layer)
-                name = model.get_bias()
-                assert isinstance(name, dict)
-                for k, v in name.items():
-                    assert isinstance(v, tf.Variable)
-            elif model_class in speech_in_text_out_models:
-                x = model.get_output_embeddings()
-                assert isinstance(x, tf.keras.layers.Layer)
-                name = model.get_bias()
-                assert name is None
-            else:
-                x = model.get_output_embeddings()
-                assert x is None
-                name = model.get_bias()
-                assert name is None
-
-    def test_determinism(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            first, second = (
-                model(
-                    self._prepare_for_class(inputs_dict, model_class),
-                    training=False,
-                )[0],
-                model(
-                    self._prepare_for_class(inputs_dict, model_class),
-                    training=False,
-                )[0],
-            )
-            out_1 = first.numpy()
-            out_2 = second.numpy()
-            out_1 = out_1[~np.isnan(out_1)]
-            out_2 = out_2[~np.isnan(out_2)]
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-    def test_model_outputs_equivalence(self):
-
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
-            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
-            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
-
-            def recursive_check(tuple_object, dict_object):
-                if isinstance(tuple_object, (List, Tuple)):
-                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
-                        recursive_check(tuple_iterable_value, dict_iterable_value)
-                elif tuple_object is None:
-                    return
-                else:
-                    self.assertTrue(
-                        all(tf.equal(tuple_object, dict_object)),
-                        msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}",
-                    )
-
-                recursive_check(tuple_output, dict_output)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(
-                model,
-                tuple_inputs,
-                dict_inputs,
-                {"output_hidden_states": True, "output_attentions": True},
-            )
 
-    def test_inputs_embeds(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            inputs = copy.deepcopy(inputs_dict)
-
-            if not self.is_encoder_decoder:
-                input_ids = inputs["input_ids"]
-                del inputs["input_ids"]
-            else:
-                encoder_input_ids = inputs["input_ids"]
-                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
-                del inputs["input_ids"]
-                inputs.pop("decoder_input_ids", None)
-
-            if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
-            else:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)
-
-            inputs = self._prepare_for_class(inputs, model_class)
-
-            model(inputs)
-
-    def test_numpy_arrays_inputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def prepare_numpy_arrays(inputs_dict):
-            inputs_np_dict = {}
-            for k, v in inputs_dict.items():
-                if tf.is_tensor(v):
-                    inputs_np_dict[k] = v.numpy()
-                else:
-                    inputs_np_dict[k] = np.array(k)
-
-            return inputs_np_dict
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-            inputs_np = prepare_numpy_arrays(inputs)
-
-            output_for_dict_input = model(inputs_np)
-            output_for_kw_input = model(**inputs_np)
-            self.assert_outputs_same(output_for_dict_input, output_for_kw_input)
-
-    def test_resize_token_embeddings(self):
-        if not self.test_resize_embeddings:
-            return
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def _get_word_embedding_weight(model, embedding_layer):
-            embeds = getattr(embedding_layer, "weight", None)
-            if embeds is not None:
-                return embeds
-
-            embeds = getattr(embedding_layer, "decoder", None)
-            if embeds is not None:
-                return embeds
-
-            model(model.dummy_inputs)
-
-            embeds = getattr(embedding_layer, "weight", None)
-            if embeds is not None:
-                return embeds
-
-            embeds = getattr(embedding_layer, "decoder", None)
-            if embeds is not None:
-                return embeds
-
-            return None
-
-        for model_class in self.all_model_classes:
-            for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
-                # build the embeddings
-                model = model_class(config=config)
-                old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
-                old_bias = model.get_bias()
-                old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
-                # reshape the embeddings
-                model.resize_token_embeddings(size)
-                new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
-                new_bias = model.get_bias()
-                new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
-
-                # check that the resized embeddings size matches the desired size.
-                assert_size = size if size is not None else config.vocab_size
-                self.assertEqual(new_input_embeddings.shape[0], assert_size)
-
-                # check that weights remain the same after resizing
-                models_equal = True
-                for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()):
-                    if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                        models_equal = False
-                self.assertTrue(models_equal)
-
-                if old_bias is not None and new_bias is not None:
-                    for old_weight, new_weight in zip(old_bias.values(), new_bias.values()):
-                        self.assertEqual(new_weight.shape[0], assert_size)
-
-                        models_equal = True
-                        for p1, p2 in zip(old_weight.value(), new_weight.value()):
-                            if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                                models_equal = False
-                        self.assertTrue(models_equal)
-
-                if old_output_embeddings is not None and new_output_embeddings is not None:
-                    self.assertEqual(new_output_embeddings.shape[0], assert_size)
-                    self.assertEqual(
-                        new_output_embeddings.shape[1],
-                        old_output_embeddings.shape[1],
-                    )
-
-                    models_equal = True
-                    for p1, p2 in zip(
-                        old_output_embeddings.value(),
-                        new_output_embeddings.value(),
-                    ):
-                        if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                            models_equal = False
-                    self.assertTrue(models_equal)
-
-    def test_lm_head_model_random_no_beam_search_generate(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
-
-        # iterate over all generative models
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
-
-            if config.bos_token_id is None:
-                # if bos token id is not defined model needs input_ids
-                with self.assertRaises(AssertionError):
-                    model.generate(do_sample=True, max_length=5)
-                # num_return_sequences = 1
-                self._check_generated_ids(model.generate(input_ids, do_sample=True))
-            elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]:
-                # Models with non-text inputs won't work here; num_return_sequences = 1
-                self._check_generated_ids(model.generate(do_sample=True, max_length=5))
-
-            with self.assertRaises(ValueError):
-                # generating multiple sequences when no beam search generation
-                # is not allowed as it would always generate the same sequences
-                model.generate(input_ids, do_sample=False, num_return_sequences=2)
-
-            # num_return_sequences > 1, sample
-            self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2))
-
-            # check bad words tokens language generation
-            # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [
-                self._generate_random_bad_tokens(1, model),
-                self._generate_random_bad_tokens(2, model),
-            ]
-            output_tokens = model.generate(
-                input_ids,
-                do_sample=True,
-                bad_words_ids=bad_words_ids,
-                num_return_sequences=2,
-            )
-            # only count generated tokens
-            generated_ids = output_tokens[:, input_ids.shape[-1] :]
-            self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
-
-    def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
-        if input_ids is None:
-            input_ids = inputs_dict.get("input_features", None)
-
-        # iterate over all generative models
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
-            output_greedy = model.generate(
-                input_ids,
-                do_sample=False,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
-            output_sample = model.generate(
-                input_ids,
-                do_sample=True,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = self.layernorm(self.pooler(last_hidden_state))
 
-            if model.config.is_encoder_decoder:
-                self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput)
-                self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput)
-            else:
-                self.assertIsInstance(output_greedy, TFGreedySearchDecoderOnlyOutput)
-                self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
-
-    def test_lm_head_model_random_beam_search_generate(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
-
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
-
-            if config.bos_token_id is None:
-                # if bos token id is not defined model needs input_ids, num_return_sequences = 1
-                self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2))
-            else:
-                # num_return_sequences = 1
-                self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2))
-
-            with self.assertRaises(AssertionError):
-                # generating more sequences than having beams leads is not possible
-                model.generate(
-                    input_ids,
-                    do_sample=False,
-                    num_return_sequences=3,
-                    num_beams=2,
-                )
-
-            # num_return_sequences > 1, sample
-            self._check_generated_ids(
-                model.generate(
-                    input_ids,
-                    do_sample=True,
-                    num_beams=2,
-                    num_return_sequences=2,
-                )
-            )
-            # num_return_sequences > 1, greedy
-            self._check_generated_ids(
-                model.generate(
-                    input_ids,
-                    do_sample=False,
-                    num_beams=2,
-                    num_return_sequences=2,
-                )
-            )
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
 
-            # check bad words tokens language generation
-            # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [
-                self._generate_random_bad_tokens(1, model),
-                self._generate_random_bad_tokens(2, model),
-            ]
-            output_tokens = model.generate(
-                input_ids,
-                do_sample=False,
-                bad_words_ids=bad_words_ids,
-                num_beams=2,
-                num_return_sequences=2,
-            )
-            # only count generated tokens
-            generated_ids = output_tokens[:, input_ids.shape[-1] :]
-            self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
-
-    def test_lm_head_model_beam_search_generate_dict_outputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
-        if input_ids is None:
-            input_ids = inputs_dict.get("input_features", None)
-
-        # iterate over all generative models
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
-            output_beam_search = model.generate(
-                input_ids,
-                num_beams=2,
-                do_sample=False,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
-            output_beam_sample = model.generate(
-                input_ids,
-                num_beams=2,
-                do_sample=True,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
 
-            if model.config.is_encoder_decoder:
-                self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput)
-                self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput)
-            else:
-                self.assertIsInstance(output_beam_search, TFBeamSearchDecoderOnlyOutput)
-                self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
-
-    def test_loss_computation(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            if getattr(model, "hf_compute_loss", None):
-                # The number of elements in the loss should be the same as the number of elements in the label
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                added_label = prepared_for_class[
-                    sorted(
-                        list(prepared_for_class.keys() - inputs_dict.keys()),
-                        reverse=True,
-                    )[0]
-                ]
-                loss_size = tf.size(added_label)
-
-                if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING):
-                    # if loss is causal lm loss, labels are shift, so that one label per batch
-                    # is cut
-                    loss_size = loss_size - self.model_tester.batch_size
-
-                # Test that model correctly compute the loss with kwargs
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                possible_input_names = {
-                    "input_ids",
-                    "pixel_values",
-                    "input_features",
-                }
-                input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
-                model_input = prepared_for_class.pop(input_name)
-
-                loss = model(model_input, **prepared_for_class)[0]
-                self.assertEqual(loss.shape, [loss_size])
-
-                # Test that model correctly compute the loss with a dict
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                loss = model(prepared_for_class)[0]
-                self.assertEqual(loss.shape, [loss_size])
-
-                # Test that model correctly compute the loss with a tuple
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-
-                # Get keys that were added with the _prepare_for_class function
-                label_keys = prepared_for_class.keys() - inputs_dict.keys()
-                signature = inspect.signature(model.call).parameters
-                signature_names = list(signature.keys())
-
-                # Create a dictionary holding the location of the tensors in the tuple
-                tuple_index_mapping = {0: input_name}
-                for label_key in label_keys:
-                    label_key_index = signature_names.index(label_key)
-                    tuple_index_mapping[label_key_index] = label_key
-                sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
-                # Initialize a list with their default values, update the values and convert to a tuple
-                list_input = []
-
-                for name in signature_names:
-                    if name != "kwargs":
-                        list_input.append(signature[name].default)
-
-                for index, value in sorted_tuple_index_mapping:
-                    list_input[index] = prepared_for_class[value]
-
-                tuple_input = tuple(list_input)
-
-                # Send to model
-                loss = model(tuple_input[:-1])[0]
-
-                self.assertEqual(loss.shape, [loss_size])
-
-    def test_generate_with_headmasking(self):
-        attention_names = [
-            "encoder_attentions",
-            "decoder_attentions",
-            "cross_attentions",
-        ]
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
-
-            # We want to test only encoder-decoder models
-            if not config.is_encoder_decoder:
-                continue
-
-            head_masking = {
-                "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)),
-                "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
-                "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
-            }
 
-            signature = inspect.signature(model.call)
-            if set(head_masking.keys()) < set([*signature.parameters.keys()]):
-                continue
-
-            for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
-                out = model.generate(
-                    inputs_dict["input_ids"],
-                    num_beams=1,
-                    max_length=inputs_dict["input_ids"] + 5,
-                    output_attentions=True,
-                    return_dict_in_generate=True,
-                    **{name: mask},
-                )
-                # We check the state of decoder_attentions and cross_attentions just from the last step
-                attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
-                self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0)
-
-    def test_load_with_mismatched_shapes(self):
-        if not self.test_mismatched_shapes:
-            return
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
-                continue
-
-            with self.subTest(msg=f"Testing {model_class}"):
-                with tempfile.TemporaryDirectory() as tmp_dir:
-                    model = model_class(config)
-                    inputs = self._prepare_for_class(inputs_dict, model_class)
-                    _ = model(**inputs)
-                    model.save_pretrained(tmp_dir)
-
-                    # Fails when we don't set ignore_mismatched_sizes=True
-                    with self.assertRaises(ValueError):
-                        new_model = TFAutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42)
-                    with self.assertRaises(ValueError):
-                        new_model_without_prefix = TFAutoModel.from_pretrained(tmp_dir, vocab_size=10)
-
-                    logger = logging.get_logger("transformers.modeling_tf_utils")
-                    with CaptureLogger(logger) as cl:
-                        new_model = TFAutoModelForSequenceClassification.from_pretrained(
-                            tmp_dir, num_labels=42, ignore_mismatched_sizes=True
-                        )
-                    self.assertIn("the shapes did not match", cl.out)
-
-                    logits = new_model(**inputs).logits
-                    self.assertEqual(logits.shape[1], 42)
-
-                    with CaptureLogger(logger) as cl:
-                        new_model_without_prefix = TFAutoModel.from_pretrained(
-                            tmp_dir, vocab_size=10, ignore_mismatched_sizes=True
-                        )
-                    self.assertIn("the shapes did not match", cl.out)
-
-                    # Although Tf models always have a prefix pointing to `MainLayer`,
-                    # we still add this "without prefix" test to keep a consistency between tf and pt tests.
-                    input_ids = ids_tensor((2, 8), 10)
-                    if self.is_encoder_decoder:
-                        new_model_without_prefix(input_ids, decoder_input_ids=input_ids)
-                    else:
-                        new_model_without_prefix(input_ids)
-
-    def test_model_main_input_name(self):
-        for model_class in self.all_model_classes:
-            model_signature = inspect.signature(getattr(model_class, "call"))
-            # The main input is the name of the argument after `self`
-            observed_main_input_name = list(model_signature.parameters.keys())[1]
-            self.assertEqual(model_class.main_input_name, observed_main_input_name)
-
-    def _generate_random_bad_tokens(self, num_bad_tokens, model):
-        # special tokens cannot be bad tokens
-        special_tokens = []
-        if model.config.bos_token_id is not None:
-            special_tokens.append(model.config.bos_token_id)
-        if model.config.pad_token_id is not None:
-            special_tokens.append(model.config.pad_token_id)
-        if model.config.eos_token_id is not None:
-            special_tokens.append(model.config.eos_token_id)
-
-        # create random bad tokens that are not special tokens
-        bad_tokens = []
-        while len(bad_tokens) < num_bad_tokens:
-            token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0]
-            if token not in special_tokens:
-                bad_tokens.append(token)
-        return bad_tokens
-
-    def _check_generated_ids(self, output_ids):
-        for token_id in output_ids[0].numpy().tolist():
-            self.assertGreaterEqual(token_id, 0)
-            self.assertLess(token_id, self.model_tester.vocab_size)
-
-    def _check_match_tokens(self, generated_ids, bad_words_ids):
-        # for all bad word tokens
-        for bad_word_ids in bad_words_ids:
-            # for all slices in batch
-            for generated_ids_slice in generated_ids:
-                # for all word idx
-                for i in range(len(bad_word_ids), len(generated_ids_slice)):
-                    # if tokens match
-                    if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids:
-                        return True
-        return False
-
-
-def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
-    """Creates a random int32 tensor of the shape within the vocab size."""
-    if rng is None:
-        rng = random.Random()
-
-    total_dims = 1
-    for dim in shape:
-        total_dims *= dim
-
-    values = []
-    for _ in range(total_dims):
-        values.append(rng.randint(0, vocab_size - 1))
-
-    output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)
-
-    return output
-
-
-def random_attention_mask(shape, rng=None, name=None, dtype=None):
-    attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype)
-    # make sure that at least one token is attended to for each batch
-    attn_mask = tf.concat(
-        [
-            tf.constant(value=1, shape=(shape[0], 1), dtype=dtype),
-            attn_mask[:, 1:],
-        ],
-        axis=1,
+class TFConvNextPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = ConvNextConfig
+    base_model_prefix = "convnext"
+    main_input_name = "pixel_values"
+
+    @property
+    def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+        """
+        Dummy inputs to build the network.
+
+        Returns:
+            `Dict[str, tf.Tensor]`: The dummy inputs.
+        """
+        VISION_DUMMY_INPUTS = tf.random.uniform(
+            shape=(
+                3,
+                self.config.num_channels,
+                self.config.image_size,
+                self.config.image_size,
+            ),
+            dtype=tf.float32,
+        )
+        return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)}
+
+    @tf.function(
+        input_signature=[
+            {
+                "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
+            }
+        ]
     )
-    return attn_mask
+    def serving(self, inputs):
+        """
+        Method used for serving the model.
 
+        Args:
+            inputs (`Dict[str, tf.Tensor]`):
+                The input of the saved model as a dictionary of tensors.
+        """
+        return self.call(inputs)
 
-def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
-    """Creates a random float32 tensor"""
-    if rng is None:
-        rng = random.Random()
 
-    total_dims = 1
-    for dim in shape:
-        total_dims *= dim
+CONVNEXT_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
 
-    values = []
-    for _ in range(total_dims):
-        values.append(rng.random() * scale)
+    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
 
-    return tf.reshape(
-        tf.constant(values, dtype=dtype if dtype is not None else tf.float32),
-        shape=shape,
-    )
+    <Tip>
 
+    TF 2.0 models accept two formats as inputs:
 
-@require_tf
-class UtilsFunctionsTest(unittest.TestCase):
-
-    # tests whether the top_k_top_p_filtering function behaves as expected
-    def test_top_k_top_p_filtering(self):
-        logits = tf.convert_to_tensor(
-            [
-                [
-                    8.2220991,  # 3rd highest value; idx. 0
-                    -0.5620044,
-                    5.23229752,
-                    4.0386393,
-                    -6.8798378,
-                    -0.54785802,
-                    -3.2012153,
-                    2.92777176,
-                    1.88171953,
-                    7.35341276,  # 5th highest value; idx. 9
-                    8.43207833,  # 2nd highest value; idx. 10
-                    -9.85711836,
-                    -5.96209236,
-                    -1.13039161,
-                    -7.1115294,
-                    -0.8369633,
-                    -5.3186408,
-                    7.06427407,
-                    0.81369344,
-                    -0.82023817,
-                    -5.9179796,
-                    0.58813443,
-                    -6.99778438,
-                    4.71551189,
-                    -0.18771637,
-                    7.44020759,  # 4th highest value; idx. 25
-                    9.38450987,  # 1st highest value; idx. 26
-                    2.12662941,
-                    -9.32562038,
-                    2.35652522,
-                ],  # cummulative prob of 5 highest values <= 0.6
-                [
-                    0.58425518,
-                    4.53139238,
-                    -5.57510464,
-                    -6.28030699,
-                    -7.19529503,
-                    -4.02122551,
-                    1.39337037,
-                    -6.06707057,
-                    1.59480517,
-                    -9.643119,
-                    0.03907799,
-                    0.67231762,
-                    -8.88206726,
-                    6.27115922,  # 4th highest value; idx. 13
-                    2.28520723,
-                    4.82767506,
-                    4.30421368,
-                    8.8275313,  # 2nd highest value; idx. 17
-                    5.44029958,  # 5th highest value; idx. 18
-                    -4.4735794,
-                    7.38579536,  # 3rd highest value; idx. 20
-                    -2.91051663,
-                    2.61946077,
-                    -2.5674762,
-                    -9.48959302,
-                    -4.02922645,
-                    -1.35416918,
-                    9.67702323,  # 1st highest value; idx. 27
-                    -5.89478553,
-                    1.85370467,
-                ],  # cummulative prob of 5 highest values <= 0.6
-            ],
-            dtype=tf.float32,
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional arguments.
+
+    This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all the
+    tensors in the first argument of the model call function: `model(inputs)`.
+
+    </Tip>
+
+    Parameters:
+        config ([`ConvNextConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CONVNEXT_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`ConvNextFeatureExtractor`]. See
+            [`ConvNextFeatureExtractor.__call__`] for details.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
+            in eager mode, in graph mode the value will always be set to True.
+"""
+
+
+@add_start_docstrings(
+    "The bare ConvNext model outputting raw features without any specific head on top.",
+    CONVNEXT_START_DOCSTRING,
+)
+class TFConvNextModel(TFConvNextPreTrainedModel):
+    def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext")
+
+    @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        pixel_values: Optional[TFModelInputType] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import ConvNextFeatureExtractor, TFConvNextModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224")
+        >>> model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224")
+
+        >>> inputs = feature_extractor(images=image, return_tensors="tf")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
         )
 
-        non_inf_expected_idx = tf.convert_to_tensor(
-            [
-                [0, 0],
-                [0, 9],
-                [0, 10],
-                [0, 25],
-                [0, 26],
-                [1, 13],
-                [1, 17],
-                [1, 18],
-                [1, 20],
-                [1, 27],
-            ],
-            dtype=tf.int32,
-        )  # expected non filtered idx as noted above
-
-        non_inf_expected_output = tf.convert_to_tensor(
-            [
-                8.222099,
-                7.3534126,
-                8.432078,
-                7.4402075,
-                9.38451,
-                6.271159,
-                8.827531,
-                5.4402995,
-                7.3857956,
-                9.677023,
-            ],
-            dtype=tf.float32,
-        )  # expected non filtered values as noted above
+        if "input_ids" in inputs:
+            inputs["pixel_values"] = inputs.pop("input_ids")
 
-        output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4)
+        if inputs["pixel_values"] is None:
+            raise ValueError("You have to specify pixel_values")
 
-        non_inf_output = output[output != -float("inf")]
-        non_inf_idx = tf.cast(
-            tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))),
-            dtype=tf.int32,
+        outputs = self.convnext(
+            pixel_values=inputs["pixel_values"],
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=inputs["training"],
         )
 
-        tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12)
-        tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx)
-
-
-@require_tf
-@is_staging_test
-class TFModelPushToHubTester(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls._token = login(username=USER, password=PASS)
-
-    @classmethod
-    def tearDownClass(cls):
-        try:
-            delete_repo(token=cls._token, name="test-model-tf")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(
-                token=cls._token,
-                name="test-model-tf-org",
-                organization="valid_org",
-            )
-        except HTTPError:
-            pass
-
-    def test_push_to_hub(self):
-        config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+        # converts back NHWC -> NCHW, to match PT's output
+        if not return_dict:
+            return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)),
+            pooler_output=outputs.pooler_output,
+            hidden_states=outputs.hidden_states,
         )
-        model = TFBertModel(config)
-        # Make sure model is properly initialized
-        _ = model(model.dummy_inputs)
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(
-                os.path.join(tmp_dir, "test-model-tf"),
-                push_to_hub=True,
-                use_auth_token=self._token,
-            )
 
-            new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
-            models_equal = True
-            for p1, p2 in zip(model.weights, new_model.weights):
-                if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                    models_equal = False
-            self.assertTrue(models_equal)
-
-    def test_push_to_hub_with_model_card(self):
-        config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+
+@add_start_docstrings(
+    """
+    ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+    ImageNet.
+    """,
+    CONVNEXT_START_DOCSTRING,
+)
+class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config: ConvNextConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.num_labels = config.num_labels
+        self.convnext = TFConvNextMainLayer(config, name="convnext")
+
+        # Classifier head
+        self.classifier = tf.keras.layers.Dense(
+            units=config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer="zeros",
+            name="classifier",
         )
-        model = TFBertModel(config)
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.push_to_hub(os.path.join(tmp_dir, "test-model-tf"))
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "test-model-card-tf", "README.md")))
-
-    def test_push_to_hub_in_organization(self):
-        config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+
+    @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        pixel_values: Optional[TFModelInputType] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        training: Optional[bool] = False,
+        **kwargs,
+    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
+        r"""
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import ConvNextFeatureExtractor, TFConvNextForImageClassification
+        >>> import tensorflow as tf
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224")
+        >>> model = TFConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224")
+
+        >>> inputs = feature_extractor(images=image, return_tensors="tf")
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        >>> # model predicts one of the 1000 ImageNet classes
+        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
+        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
+        ```"""
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            labels=labels,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if "input_ids" in inputs:
+            inputs["pixel_values"] = inputs.pop("input_ids")
+
+        if inputs["pixel_values"] is None:
+            raise ValueError("You have to specify pixel_values")
+
+        outputs = self.convnext(
+            inputs["pixel_values"],
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=inputs["training"],
         )
-        model = TFBertModel(config)
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(
-                os.path.join(tmp_dir, "test-model-tf-org"),
-                push_to_hub=True,
-                use_auth_token=self._token,
-                organization="valid_org",
-            )
 
-            new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org")
-            models_equal = True
-            for p1, p2 in zip(model.weights, new_model.weights):
-                if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                    models_equal = False
-            self.assertTrue(models_equal)
+        pooled_output = outputs.pooler_output if return_dict else outputs[1]
+
+        logits = self.classifier(pooled_output)
+        loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
+
+        if not inputs["return_dict"]:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+        )
diff --git a/tests/convnext/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py
index 6f8c142b654d8..cfc2646176448 100644
--- a/tests/convnext/test_modeling_tf_convnext.py
+++ b/tests/convnext/test_modeling_tf_convnext.py
@@ -22,8 +22,8 @@
 from transformers.file_utils import cached_property, is_tf_available, is_vision_available
 from transformers.testing_utils import require_tf, require_vision, slow
 
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
+from ..test_configuration_common import ConfigTester
+from ..test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
 
 
 if is_tf_available():

From 3e069429bf6bb0a5e6a54de2fb5e2443b82f5b89 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 24 Feb 2022 13:41:36 +0530
Subject: [PATCH 52/65] chore: applied sgugger's suggestion for dealing w/
 output_attentions.

---
 src/transformers/modeling_tf_utils.py | 122 +++++++++++++++++++++-----
 1 file changed, 98 insertions(+), 24 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index f85f3aaa8e028..d2e5a5ba9ca9d 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -312,7 +312,7 @@ def booleans_processing(config, **kwargs):
 
     if tf.executing_eagerly():
         final_booleans["output_attentions"] = kwargs.get("output_attentions", None)
-        if not final_booleans["output_attentions"]:
+        if final_booleans["output_attentions"] is None:
             final_booleans["output_attentions"] = config.output_attentions
 
         final_booleans["output_hidden_states"] = (
@@ -366,7 +366,17 @@ def input_processing(func, config, input_ids, **kwargs):
     signature.pop("self", None)
     parameter_names = list(signature.keys())
     output = {}
-    allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray, KerasTensor)
+    allowed_types = (
+        tf.Tensor,
+        bool,
+        int,
+        ModelOutput,
+        tuple,
+        list,
+        dict,
+        np.ndarray,
+        KerasTensor,
+    )
 
     if "inputs" in kwargs["kwargs_call"]:
         warnings.warn(
@@ -479,7 +489,13 @@ def input_processing(func, config, input_ids, **kwargs):
     boolean_dict = {
         k: v
         for k, v in output.items()
-        if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"]
+        if k
+        in [
+            "return_dict",
+            "output_attentions",
+            "output_hidden_states",
+            "use_cache",
+        ]
     }
 
     output.update(
@@ -578,11 +594,18 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False,
                             # If yes we reshape the weight from the H5 file accordingly to the current weight
                             # If the two shapes are not compatible we raise an issue
                             try:
-                                array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight))
+                                array = np.reshape(
+                                    saved_weight_value,
+                                    K.int_shape(symbolic_weight),
+                                )
                             except ValueError as e:
                                 if ignore_mismatched_sizes:
                                     mismatched_layers.append(
-                                        (symbolic_weight_name, saved_weight_value.shape, K.int_shape(symbolic_weight))
+                                        (
+                                            symbolic_weight_name,
+                                            saved_weight_value.shape,
+                                            K.int_shape(symbolic_weight),
+                                        )
                                     )
                                     continue
                                 else:
@@ -626,11 +649,17 @@ def init_copy_embeddings(old_embeddings, new_num_tokens):
         # and we create a mask to properly identify the padded values and be replaced by the values of the newly created
         # embeddings
         current_weights = tf.pad(
-            old_embeddings.value(), tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=-1
+            old_embeddings.value(),
+            tf.convert_to_tensor([[0, size_diff], [0, 0]]),
+            constant_values=-1,
         )
         num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
         mask = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True)
-        mask = tf.pad(mask, tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=False)
+        mask = tf.pad(
+            mask,
+            tf.convert_to_tensor([[0, size_diff], [0, 0]]),
+            constant_values=False,
+        )
     else:
         # if the new size if lower than the old one, we take the current embeddings until the new size
         current_weights = tf.slice(
@@ -775,7 +804,10 @@ def _save_checkpoint(self, checkpoint_dir, epoch):
         # internally and which users are likely to use too
         weights_path = os.path.join(checkpoint_dir, "weights.h5")
         self.save_weights(weights_path)
-        extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()}
+        extra_data = {
+            "epoch": epoch,
+            "optimizer_state": self.optimizer.get_weights(),
+        }
         extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle")
         with open(extra_data_path, "wb") as f:
             pickle.dump(extra_data, f)
@@ -801,7 +833,10 @@ def load_repo_checkpoint(self, repo_path_or_name):
         if not os.path.isdir(repo_path_or_name):
             # If this isn't a local path, check that the remote repo exists and has a checkpoint in it
             repo_files = list_repo_files(repo_path_or_name)
-            for file in ("checkpoint/weights.h5", "checkpoint/extra_data.pickle"):
+            for file in (
+                "checkpoint/weights.h5",
+                "checkpoint/extra_data.pickle",
+            ):
                 if file not in repo_files:
                     raise FileNotFoundError(f"Repo {repo_path_or_name} does not contain checkpoint file {file}!")
             if "/" not in repo_path_or_name:
@@ -809,7 +844,10 @@ def load_repo_checkpoint(self, repo_path_or_name):
                 repo_path_or_name = self.get_full_repo_name(repo_path_or_name)
             else:
                 model_id = repo_path_or_name.split("/")[-1]
-            repo = Repository(model_id, clone_from=f"https://huggingface.co/{repo_path_or_name}")
+            repo = Repository(
+                model_id,
+                clone_from=f"https://huggingface.co/{repo_path_or_name}",
+            )
             local_dir = repo.local_dir
         else:
             local_dir = repo_path_or_name
@@ -1066,7 +1104,8 @@ def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]:
             `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
         """
         warnings.warn(
-            "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning
+            "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.",
+            FutureWarning,
         )
         return self.get_lm_head()
 
@@ -1077,7 +1116,10 @@ def get_prefix_bias_name(self) -> Union[None, str]:
         Return:
             `str`: The _prefix name of the bias.
         """
-        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        warnings.warn(
+            "The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.",
+            FutureWarning,
+        )
         return None
 
     def get_bias(self) -> Union[None, Dict[str, tf.Variable]]:
@@ -1225,15 +1267,25 @@ def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens):
             # initialize new bias
             if tf.math.greater(size_diff, 0):
                 padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]]
-                current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1)
+                current_bias = tf.pad(
+                    weight.value(),
+                    tf.convert_to_tensor(padding_shape),
+                    constant_values=-1,
+                )
                 num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
                 mask_shape = [num_tokens_to_copy] if first_dim is None else [1, num_tokens_to_copy]
                 bias_mask = tf.fill(tf.convert_to_tensor(mask_shape), True)
-                bias_mask = tf.pad(bias_mask, tf.convert_to_tensor(padding_shape), constant_values=False)
+                bias_mask = tf.pad(
+                    bias_mask,
+                    tf.convert_to_tensor(padding_shape),
+                    constant_values=False,
+                )
             else:
                 slice_from = [0] if first_dim is None else [0, 0]
                 current_bias = tf.slice(
-                    weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape)
+                    weight.value(),
+                    tf.convert_to_tensor(slice_from),
+                    tf.convert_to_tensor(final_shape),
                 )
                 bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True)
 
@@ -1374,7 +1426,11 @@ def save_pretrained(self, save_directory, saved_model=False, version=1, push_to_
 
         if saved_model:
             saved_model_dir = os.path.join(save_directory, "saved_model", str(version))
-            self.save(saved_model_dir, include_optimizer=False, signatures=self.serving)
+            self.save(
+                saved_model_dir,
+                include_optimizer=False,
+                signatures=self.serving,
+            )
             logger.info(f"Saved model created in {saved_model_dir}")
 
         # Save configuration file
@@ -1526,7 +1582,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         from_pipeline = kwargs.pop("_from_pipeline", None)
         from_auto_class = kwargs.pop("_from_auto", False)
 
-        user_agent = {"file_type": "model", "framework": "tensorflow", "from_auto_class": from_auto_class}
+        user_agent = {
+            "file_type": "model",
+            "framework": "tensorflow",
+            "from_auto_class": from_auto_class,
+        }
         if from_pipeline is not None:
             user_agent["using_pipeline"] = from_pipeline
 
@@ -1622,7 +1682,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                         "proxies": proxies,
                         "use_auth_token": use_auth_token,
                     }
-                    if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs):
+                    if has_file(
+                        pretrained_model_name_or_path,
+                        WEIGHTS_NAME,
+                        **has_file_kwargs,
+                    ):
                         raise EnvironmentError(
                             f"{pretrained_model_name_or_path} does not appear to have a file named {TF2_WEIGHTS_NAME} "
                             "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from "
@@ -1772,7 +1836,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
 # To update the docstring, we need to copy the method, otherwise we change the original docstring.
 TFPreTrainedModel.push_to_hub = copy_func(TFPreTrainedModel.push_to_hub)
 TFPreTrainedModel.push_to_hub.__doc__ = TFPreTrainedModel.push_to_hub.__doc__.format(
-    object="model", object_class="TFAutoModel", object_files="model checkpoint"
+    object="model",
+    object_class="TFAutoModel",
+    object_files="model checkpoint",
 )
 
 
@@ -1801,7 +1867,9 @@ def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
 
     def build(self, input_shape):
         self.weight = self.add_weight(
-            "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range)
+            "weight",
+            shape=[self.nx, self.nf],
+            initializer=get_initializer(self.initializer_range),
         )
         self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer())
 
@@ -1839,7 +1907,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
+        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
 
     def build(self, input_shape):
         """
@@ -1847,7 +1915,9 @@ def build(self, input_shape):
         https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
         self.weight = self.add_weight(
-            "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
+            "weight",
+            shape=[self.vocab_size, self.hidden_size],
+            initializer=get_initializer(self.initializer_range),
         )
         super().build(input_shape)
 
@@ -1961,7 +2031,9 @@ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **
             else:
                 num_classes = config.hidden_size
             self.summary = tf.keras.layers.Dense(
-                num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
+                num_classes,
+                kernel_initializer=get_initializer(initializer_range),
+                name="summary",
             )
 
         self.has_activation = False
@@ -2056,7 +2128,9 @@ def register_for_auto_class(cls, auto_class="TFAutoModel"):
         cls._auto_class = auto_class
 
 
-def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal:
+def get_initializer(
+    initializer_range: float = 0.02,
+) -> tf.initializers.TruncatedNormal:
     """
     Creates a `tf.initializers.TruncatedNormal` with the given range.
 

From bc46016955eb70a6e2f6ea31e938f66e738b60af Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 24 Feb 2022 13:43:19 +0530
Subject: [PATCH 53/65] chore: added comments.

---
 src/transformers/modeling_tf_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index d2e5a5ba9ca9d..9d392ec6e4ff0 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -311,6 +311,7 @@ def booleans_processing(config, **kwargs):
     final_booleans = {}
 
     if tf.executing_eagerly():
+        # Pure conv models (such as ConvNext) do not have `output_attentions`
         final_booleans["output_attentions"] = kwargs.get("output_attentions", None)
         if final_booleans["output_attentions"] is None:
             final_booleans["output_attentions"] = config.output_attentions

From 06e19cd3d9d9b0533b113cfb4dca59330c0d1a89 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 24 Feb 2022 15:07:57 +0530
Subject: [PATCH 54/65] chore: applied updated quality environment style.

---
 src/transformers/modeling_tf_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 9d392ec6e4ff0..4637130e7771c 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -1908,7 +1908,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
+        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
 
     def build(self, input_shape):
         """

From 229a817ad8d9eb004fa11f548c483ad26a3a4283 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 24 Feb 2022 15:10:24 +0530
Subject: [PATCH 55/65] chore: applied formatting with quality environment.

---
 tests/test_modeling_common.py | 3066 ++++++++++++++-------------------
 1 file changed, 1298 insertions(+), 1768 deletions(-)

diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 17888bcfac380..bf707b762c394 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -13,386 +13,196 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 import copy
-import gc
 import inspect
 import json
 import os
-import os.path
 import random
-import sys
 import tempfile
 import unittest
-import warnings
-from pathlib import Path
-from typing import Dict, List, Tuple
-
-import numpy as np
+from importlib import import_module
+from typing import List, Tuple
 
-import transformers
-from huggingface_hub import Repository, delete_repo, login
+from huggingface_hub import delete_repo, login
 from requests.exceptions import HTTPError
-from transformers import (
-    AutoConfig,
-    AutoModel,
-    AutoModelForSequenceClassification,
-    PretrainedConfig,
-    is_torch_available,
-    logging,
-)
-from transformers.file_utils import WEIGHTS_NAME, is_flax_available, is_torch_fx_available
+from transformers import is_tf_available
 from transformers.models.auto import get_values
+from transformers.testing_utils import tooslow  # noqa: F401
 from transformers.testing_utils import (
     PASS,
     USER,
     CaptureLogger,
-    TestCasePlus,
-    is_pt_flax_cross_test,
+    _tf_gpu_memory_limit,
     is_pt_tf_cross_test,
     is_staging_test,
-    require_torch,
-    require_torch_multi_gpu,
+    require_tf,
+    require_tf2onnx,
     slow,
-    torch_device,
 )
+from transformers.utils import logging
 
 
-sys.path.append(str(Path(__file__).parent.parent / "utils"))
-
-from test_module.custom_configuration import CustomConfig, NoSuperInitConfig  # noqa E402
+if is_tf_available():
+    import numpy as np
+    import tensorflow as tf
 
-
-if is_torch_available():
-    import torch
-    from torch import nn
-
-    from test_module.custom_modeling import CustomModel, NoSuperInitModel
     from transformers import (
-        BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-        MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
-        MODEL_FOR_CAUSAL_LM_MAPPING,
-        MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
-        MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
-        MODEL_FOR_MASKED_LM_MAPPING,
-        MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
-        MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
-        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-        MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-        MODEL_MAPPING,
-        AdaptiveEmbedding,
+        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+        TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+        TF_MODEL_FOR_MASKED_LM_MAPPING,
+        TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
+        TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
+        TF_MODEL_FOR_PRETRAINING_MAPPING,
+        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+        TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
+        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
         BertConfig,
-        BertModel,
-        PreTrainedModel,
-        T5Config,
-        T5ForConditionalGeneration,
+        TFAutoModel,
+        TFAutoModelForSequenceClassification,
+        TFBertModel,
+        TFSharedEmbeddings,
+        tf_top_k_top_p_filtering,
     )
-
-if is_flax_available():
-    import jax.numpy as jnp
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
+    from transformers.generation_tf_utils import (
+        TFBeamSampleDecoderOnlyOutput,
+        TFBeamSampleEncoderDecoderOutput,
+        TFBeamSearchDecoderOnlyOutput,
+        TFBeamSearchEncoderDecoderOutput,
+        TFGreedySearchDecoderOnlyOutput,
+        TFGreedySearchEncoderDecoderOutput,
+        TFSampleDecoderOnlyOutput,
+        TFSampleEncoderDecoderOutput,
     )
 
-if is_torch_fx_available():
-    from transformers.utils.fx import symbolic_trace
+    if _tf_gpu_memory_limit is not None:
+        gpus = tf.config.list_physical_devices("GPU")
+        for gpu in gpus:
+            # Restrict TensorFlow to only allocate x GB of memory on the GPUs
+            try:
+                tf.config.set_logical_device_configuration(
+                    gpu,
+                    [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)],
+                )
+                logical_gpus = tf.config.list_logical_devices("GPU")
+                print("Logical GPUs", logical_gpus)
+            except RuntimeError as e:
+                # Virtual devices must be set before GPUs have been initialized
+                print(e)
 
 
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__.keys():
-        if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
-            setattr(configs_no_init, key, 1e-10)
+        if "_range" in key or "_std" in key:
+            setattr(configs_no_init, key, 0.0)
     return configs_no_init
 
 
-TINY_T5 = "patrickvonplaten/t5-tiny-random"
-
-
-@require_torch
-class ModelTesterMixin:
+@require_tf
+class TFModelTesterMixin:
 
     model_tester = None
     all_model_classes = ()
     all_generative_model_classes = ()
-    fx_compatible = False
-    test_torchscript = True
-    test_pruning = True
+    test_mismatched_shapes = True
     test_resize_embeddings = True
-    test_resize_position_embeddings = False
     test_head_masking = True
-    test_mismatched_shapes = True
-    test_missing_keys = True
-    test_model_parallel = False
     is_encoder_decoder = False
 
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
         inputs_dict = copy.deepcopy(inputs_dict)
-        if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
+
+        if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
             inputs_dict = {
-                k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
-                if isinstance(v, torch.Tensor) and v.ndim > 1
+                k: tf.tile(
+                    tf.expand_dims(v, 1),
+                    (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1),
+                )
+                if isinstance(v, tf.Tensor) and v.ndim > 0
                 else v
                 for k, v in inputs_dict.items()
             }
 
         if return_labels:
-            if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
-            elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
-                inputs_dict["start_positions"] = torch.zeros(
-                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
-                )
-                inputs_dict["end_positions"] = torch.zeros(
-                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
-                )
+            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
+                inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
+            elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
+                inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+                inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
             elif model_class in [
-                *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
-                *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
-                *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
+                *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
+                *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
             ]:
-                inputs_dict["labels"] = torch.zeros(
-                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
-                )
+                inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+            elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
+                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
             elif model_class in [
-                *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
-                *get_values(MODEL_FOR_CAUSAL_LM_MAPPING),
-                *get_values(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING),
-                *get_values(MODEL_FOR_MASKED_LM_MAPPING),
-                *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
+                *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
+                *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
             ]:
-                inputs_dict["labels"] = torch.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
-                )
-            elif model_class in get_values(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING):
-                num_patches = self.model_tester.image_size // self.model_tester.patch_size
-                inputs_dict["bool_masked_pos"] = torch.zeros(
-                    (self.model_tester.batch_size, num_patches**2), dtype=torch.long, device=torch_device
+                inputs_dict["labels"] = tf.zeros(
+                    (
+                        self.model_tester.batch_size,
+                        self.model_tester.seq_length,
+                    ),
+                    dtype=tf.int32,
                 )
         return inputs_dict
 
+    def test_initialization(self):
+        pass
+
     def test_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            out_2 = outputs[0].cpu().numpy()
-            out_2[np.isnan(out_2)] = 0
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
 
             with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
+                model.save_pretrained(tmpdirname, saved_model=False)
                 model = model_class.from_pretrained(tmpdirname)
-                model.to(torch_device)
-                with torch.no_grad():
-                    after_outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-                # Make sure we don't have nans
-                out_1 = after_outputs[0].cpu().numpy()
-                out_1[np.isnan(out_1)] = 0
-                max_diff = np.amax(np.abs(out_1 - out_2))
-                self.assertLessEqual(max_diff, 1e-5)
-
-    def test_save_load_keys_to_ignore_on_save(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            _keys_to_ignore_on_save = getattr(model, "_keys_to_ignore_on_save", None)
-            if _keys_to_ignore_on_save is None:
-                continue
+                after_outputs = model(self._prepare_for_class(inputs_dict, model_class))
 
-            # check the keys are in the original state_dict
-            for k in _keys_to_ignore_on_save:
-                self.assertIn(k, model.state_dict().keys(), "\n".join(model.state_dict().keys()))
-
-            # check that certain keys didn't get saved with the model
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                output_model_file = os.path.join(tmpdirname, WEIGHTS_NAME)
-                state_dict_saved = torch.load(output_model_file)
-                for k in _keys_to_ignore_on_save:
-                    self.assertNotIn(k, state_dict_saved.keys(), "\n".join(state_dict_saved.keys()))
-
-                # Test we can load the state dict in the model, necessary for the checkpointing API in Trainer.
-                load_result = model.load_state_dict(state_dict_saved, strict=False)
-                self.assertTrue(
-                    len(load_result.missing_keys) == 0
-                    or set(load_result.missing_keys) == set(model._keys_to_ignore_on_save)
-                )
-                self.assertTrue(len(load_result.unexpected_keys) == 0)
-
-    def test_gradient_checkpointing_backward_compatibility(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            if not model_class.supports_gradient_checkpointing:
-                continue
-
-            config.gradient_checkpointing = True
-            model = model_class(config)
-            self.assertTrue(model.is_gradient_checkpointing)
-
-    def test_gradient_checkpointing_enable_disable(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            if not model_class.supports_gradient_checkpointing:
-                continue
-
-            # at init model should have gradient checkpointing disabled
-            model = model_class(config)
-            self.assertFalse(model.is_gradient_checkpointing)
-
-            # check enable works
-            model.gradient_checkpointing_enable()
-            self.assertTrue(model.is_gradient_checkpointing)
-
-            # check disable works
-            model.gradient_checkpointing_disable()
-            self.assertFalse(model.is_gradient_checkpointing)
-
-    def _mock_init_weights(self, module):
-        if hasattr(module, "weight") and module.weight is not None:
-            module.weight.data.fill_(3)
-        if hasattr(module, "bias") and module.bias is not None:
-            module.bias.data.fill_(3)
-
-    def test_save_load_fast_init_from_base(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = MODEL_MAPPING[config.__class__]
-
-        if isinstance(base_class, tuple):
-            base_class = base_class[0]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            # make a copy of model class to not break future tests
-            # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class
-            class CopyClass(model_class):
-                pass
-
-            model_class_copy = CopyClass
-
-            # make sure that all keys are expected for test
-            model_class_copy._keys_to_ignore_on_load_missing = []
-
-            # make init deterministic, but make sure that
-            # non-initialized weights throw errors nevertheless
-            model_class_copy._init_weights = self._mock_init_weights
-
-            model = base_class(config)
-            state_dict = model.state_dict()
-
-            # this will often delete a single weight of a multi-weight module
-            # to test an edge case
-            random_key_to_del = random.choice(list(state_dict.keys()))
-            del state_dict[random_key_to_del]
-
-            # check that certain keys didn't get saved with the model
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin"))
+                self.assert_outputs_same(after_outputs, outputs)
 
-                model_fast_init = model_class_copy.from_pretrained(tmpdirname)
-                model_slow_init = model_class_copy.from_pretrained(tmpdirname, _fast_init=False)
-
-                for key in model_fast_init.state_dict().keys():
-                    max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    def test_save_load_fast_init_to_base(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = MODEL_MAPPING[config.__class__]
-
-        if isinstance(base_class, tuple):
-            base_class = base_class[0]
-
-        for model_class in self.all_model_classes:
-
-            if model_class == base_class:
-                continue
-
-            # make a copy of model class to not break future tests
-            # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class
-            class CopyClass(base_class):
-                pass
-
-            base_class_copy = CopyClass
-
-            # make sure that all keys are expected for test
-            base_class_copy._keys_to_ignore_on_load_missing = []
-
-            # make init deterministic, but make sure that
-            # non-initialized weights throw errors nevertheless
-            base_class_copy._init_weights = self._mock_init_weights
-
-            model = model_class(config)
-            state_dict = model.state_dict()
-
-            # this will often delete a single weight of a multi-weight module
-            # to test an edge case
-            random_key_to_del = random.choice(list(state_dict.keys()))
-            del state_dict[random_key_to_del]
-
-            # check that certain keys didn't get saved with the model
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.config.save_pretrained(tmpdirname)
-                torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin"))
-
-                model_fast_init = base_class_copy.from_pretrained(tmpdirname)
-                model_slow_init = base_class_copy.from_pretrained(tmpdirname, _fast_init=False)
-
-                for key in model_fast_init.state_dict().keys():
-                    max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
-    def test_determinism(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_save_load_config(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
-                second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
-
-            out_1 = first.cpu().numpy()
-            out_2 = second.cpu().numpy()
-            out_1 = out_1[~np.isnan(out_1)]
-            out_2 = out_2[~np.isnan(out_2)]
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            model_config = model.get_config()
+            # make sure that returned config is jsonifiable, which is required by keras
+            json.dumps(model_config)
+            new_model = model_class.from_config(model.get_config())
+            # make sure it also accepts a normal config
+            _ = model_class.from_config(model.config)
+            _ = new_model(self._prepare_for_class(inputs_dict, model_class))  # Build model
+            new_model.set_weights(model.get_weights())
+            after_outputs = new_model(self._prepare_for_class(inputs_dict, model_class))
+
+            self.assert_outputs_same(after_outputs, outputs)
 
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            signature = inspect.signature(model.forward)
+            signature = inspect.signature(model.call)
             # signature.parameters is an OrderedDict => so arg_names order is deterministic
             arg_names = [*signature.parameters.keys()]
 
@@ -404,1010 +214,707 @@ def test_forward_signature(self):
                     "decoder_attention_mask",
                 ]
                 expected_arg_names.extend(
-                    ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"]
-                    if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names
+                    ["head_mask", "decoder_head_mask"] if "head_mask" and "decoder_head_mask" in arg_names else []
+                )
+                # Necessary to handle BART with newly added cross_attn_head_mask
+                expected_arg_names.extend(
+                    ["cross_attn_head_mask", "encoder_outputs"]
+                    if "cross_attn_head_mask" in arg_names
                     else ["encoder_outputs"]
                 )
                 self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
             else:
                 expected_arg_names = ["input_ids"]
                 self.assertListEqual(arg_names[:1], expected_arg_names)
 
-    def test_training(self):
-        if not self.model_tester.is_training:
+    def test_onnx_compliancy(self):
+        if not self.test_onnx:
             return
 
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            config.return_dict = True
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        INTERNAL_OPS = [
+            "Assert",
+            "AssignVariableOp",
+            "EmptyTensorList",
+            "ReadVariableOp",
+            "ResourceGather",
+            "TruncatedNormal",
+            "VarHandleOp",
+            "VarIsInitializedOp",
+        ]
+        onnx_ops = []
 
-            if model_class in get_values(MODEL_MAPPING):
-                continue
+        with open(os.path.join(".", "utils", "tf_ops", "onnx.json")) as f:
+            onnx_opsets = json.load(f)["opsets"]
 
-            model = model_class(config)
-            model.to(torch_device)
-            model.train()
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            loss = model(**inputs).loss
-            loss.backward()
-
-    def test_training_gradient_checkpointing(self):
-        if not self.model_tester.is_training:
-            return
+        for i in range(1, self.onnx_min_opset + 1):
+            onnx_ops.extend(onnx_opsets[str(i)])
 
         for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            config.use_cache = False
-            config.return_dict = True
+            model_op_names = set()
 
-            if model_class in get_values(MODEL_MAPPING) or not model_class.supports_gradient_checkpointing:
-                continue
-            model = model_class(config)
-            model.to(torch_device)
-            model.gradient_checkpointing_enable()
-            model.train()
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            loss = model(**inputs).loss
-            loss.backward()
+            with tf.Graph().as_default() as g:
+                model = model_class(config)
+                model(model.dummy_inputs)
 
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
+                for op in g.get_operations():
+                    model_op_names.add(op.node_def.op)
 
-        seq_len = getattr(self.model_tester, "seq_length", None)
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
-        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-        chunk_length = getattr(self.model_tester, "chunk_length", None)
-        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
-            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
+            model_op_names = sorted(model_op_names)
+            incompatible_ops = []
 
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+            for op in model_op_names:
+                if op not in onnx_ops and op not in INTERNAL_OPS:
+                    incompatible_ops.append(op)
 
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+            self.assertEqual(len(incompatible_ops), 0, incompatible_ops)
 
-            if chunk_length is not None:
-                self.assertListEqual(
-                    list(attentions[0].shape[-4:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
-                )
-            else:
-                self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                )
-            out_len = len(outputs)
+    @require_tf2onnx
+    @slow
+    def test_onnx_runtime_optimize(self):
+        if not self.test_onnx:
+            return
 
-            if self.is_encoder_decoder:
-                correct_outlen = 5
-
-                # loss is at first position
-                if "labels" in inputs_dict:
-                    correct_outlen += 1  # loss is added to beginning
-                # Question Answering model returns start_logits and end_logits
-                if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
-                    correct_outlen += 1  # start_logits and end_logits instead of only 1 output
-                if "past_key_values" in outputs:
-                    correct_outlen += 1  # past_key_values have been returned
-
-                self.assertEqual(out_len, correct_outlen)
-
-                # decoder attentions
-                decoder_attentions = outputs.decoder_attentions
-                self.assertIsInstance(decoder_attentions, (list, tuple))
-                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(decoder_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
-                )
+        import onnxruntime
+        import tf2onnx
 
-                # cross attentions
-                cross_attentions = outputs.cross_attentions
-                self.assertIsInstance(cross_attentions, (list, tuple))
-                self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(cross_attentions[0].shape[-3:]),
-                    [
-                        self.model_tester.num_attention_heads,
-                        decoder_seq_length,
-                        encoder_key_length,
-                    ],
-                )
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
+        for model_class in self.all_model_classes:
             model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            model(model.dummy_inputs)
 
-            if hasattr(self.model_tester, "num_hidden_states_types"):
-                added_hidden_states = self.model_tester.num_hidden_states_types
-            elif self.is_encoder_decoder:
-                added_hidden_states = 2
-            else:
-                added_hidden_states = 1
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
+            onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset)
 
-            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
 
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-            if chunk_length is not None:
-                self.assertListEqual(
-                    list(self_attentions[0].shape[-4:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
-                )
+    def test_keras_save_load(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        tf_main_layer_classes = set(
+            module_member
+            for model_class in self.all_model_classes
+            for module in (import_module(model_class.__module__),)
+            for module_member_name in dir(module)
+            if module_member_name.endswith("MainLayer")
+            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
+            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
+            for module_member in (getattr(module, module_member_name),)
+            if isinstance(module_member, type)
+            and tf.keras.layers.Layer in module_member.__bases__
+            and getattr(module_member, "_keras_serializable", False)
+        )
+        for main_layer_class in tf_main_layer_classes:
+            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
+            if "T5" in main_layer_class.__name__:
+                # Take the same values as in TFT5ModelTester for this shared layer
+                shared = TFSharedEmbeddings(99, 32, name="shared")
+                config.use_cache = inputs_dict.pop("use_cache", None)
+                main_layer = main_layer_class(config, embed_tokens=shared)
             else:
-                self.assertListEqual(
-                    list(self_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                )
+                main_layer = main_layer_class(config)
 
-    @slow
-    def test_torchscript(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        self._create_and_check_torchscript(config, inputs_dict)
+            symbolic_inputs = {
+                name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
+            }
 
-    @slow
-    def test_torchscript_output_attentions(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_attentions = True
-        self._create_and_check_torchscript(config, inputs_dict)
+            model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
+            outputs = model(inputs_dict)
 
-    @slow
-    def test_torchscript_output_hidden_state(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        self._create_and_check_torchscript(config, inputs_dict)
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                filepath = os.path.join(tmpdirname, "keras_model.h5")
+                model.save(filepath)
+                if "T5" in main_layer_class.__name__:
+                    model = tf.keras.models.load_model(
+                        filepath,
+                        custom_objects={
+                            main_layer_class.__name__: main_layer_class,
+                            "TFSharedEmbeddings": TFSharedEmbeddings,
+                        },
+                    )
+                else:
+                    model = tf.keras.models.load_model(
+                        filepath,
+                        custom_objects={main_layer_class.__name__: main_layer_class},
+                    )
+                assert isinstance(model, tf.keras.Model)
+                after_outputs = model(inputs_dict)
+                self.assert_outputs_same(after_outputs, outputs)
+
+    def assert_outputs_same(self, after_outputs, outputs):
+        # Make sure we don't have nans
+        if isinstance(after_outputs, tf.Tensor):
+            out_1 = after_outputs.numpy()
+        elif isinstance(after_outputs, dict):
+            out_1 = after_outputs[list(after_outputs.keys())[0]].numpy()
+        else:
+            out_1 = after_outputs[0].numpy()
+        out_2 = outputs[0].numpy()
+        self.assertEqual(out_1.shape, out_2.shape)
+        out_1 = out_1[~np.isnan(out_1)]
+        out_2 = out_2[~np.isnan(out_2)]
+        max_diff = np.amax(np.abs(out_1 - out_2))
+        self.assertLessEqual(max_diff, 1e-5)
 
-    def _create_and_check_torchscript(self, config, inputs_dict):
-        if not self.test_torchscript:
-            return
+    @is_pt_tf_cross_test
+    def test_pt_tf_model_equivalence(self):
+        import torch
+
+        import transformers
+
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        configs_no_init.torchscript = True
         for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
-            inputs = self._prepare_for_class(inputs_dict, model_class)
+            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
+            pt_model_class = getattr(transformers, pt_model_class_name)
 
-            try:
-                if model.config.is_encoder_decoder:
-                    model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
-                    input_ids = inputs["input_ids"]
-                    attention_mask = inputs["attention_mask"]
-                    decoder_input_ids = inputs["decoder_input_ids"]
-                    decoder_attention_mask = inputs["decoder_attention_mask"]
-                    traced_model = torch.jit.trace(
-                        model, (input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)
-                    )
+            config.output_hidden_states = True
+
+            tf_model = model_class(config)
+            pt_model = pt_model_class(config)
+
+            # Check we can load pt model in tf and vice-versa with model => model functions
+            tf_model = transformers.load_pytorch_model_in_tf2_model(
+                tf_model,
+                pt_model,
+                tf_inputs=self._prepare_for_class(inputs_dict, model_class),
+            )
+            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+
+            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = {}
+            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
+                if type(key) == bool:
+                    pt_inputs_dict[name] = key
+                elif name == "input_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                elif name == "pixel_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                elif name == "input_features":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
                 else:
-                    input_ids = inputs["input_ids"]
-                    traced_model = torch.jit.trace(model, input_ids)
-            except RuntimeError:
-                self.fail("Couldn't trace module.")
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
 
-            with tempfile.TemporaryDirectory() as tmp_dir_name:
-                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(
+                self._prepare_for_class(inputs_dict, model_class),
+                training=False,
+            )
 
-                try:
-                    torch.jit.save(traced_model, pt_file_name)
-                except Exception:
-                    self.fail("Couldn't save module.")
+            tf_hidden_states = tfo[0].numpy()
+            pt_hidden_states = pto[0].numpy()
 
-                try:
-                    loaded_model = torch.jit.load(pt_file_name)
-                except Exception:
-                    self.fail("Couldn't load module.")
+            tf_nans = np.copy(np.isnan(tf_hidden_states))
+            pt_nans = np.copy(np.isnan(pt_hidden_states))
 
-            model.to(torch_device)
-            model.eval()
+            pt_hidden_states[tf_nans] = 0
+            tf_hidden_states[tf_nans] = 0
+            pt_hidden_states[pt_nans] = 0
+            tf_hidden_states[pt_nans] = 0
 
-            loaded_model.to(torch_device)
-            loaded_model.eval()
+            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
+            self.assertLessEqual(max_diff, 4e-2)
 
-            model_state_dict = model.state_dict()
-            loaded_model_state_dict = loaded_model.state_dict()
+            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
+                torch.save(pt_model.state_dict(), pt_checkpoint_path)
+                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
 
-            non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
-                    non_persistent_buffers[key] = loaded_model_state_dict[key]
+                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
+                tf_model.save_weights(tf_checkpoint_path)
+                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
 
-            loaded_model_state_dict = {
-                key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
-            }
+            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = {}
+            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
+                if type(key) == bool:
+                    key = np.array(key, dtype=bool)
+                    pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long)
+                elif name == "input_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                elif name == "pixel_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                elif name == "input_features":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                else:
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))
+            tfo = tfo[0].numpy()
+            pto = pto[0].numpy()
+            tf_nans = np.copy(np.isnan(tfo))
+            pt_nans = np.copy(np.isnan(pto))
 
-            self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
+            pto[tf_nans] = 0
+            tfo[tf_nans] = 0
+            pto[pt_nans] = 0
+            tfo[pt_nans] = 0
 
-            model_buffers = list(model.buffers())
-            for non_persistent_buffer in non_persistent_buffers.values():
-                found_buffer = False
-                for i, model_buffer in enumerate(model_buffers):
-                    if torch.equal(non_persistent_buffer, model_buffer):
-                        found_buffer = True
-                        break
+            max_diff = np.amax(np.abs(tfo - pto))
+            self.assertLessEqual(max_diff, 4e-2)
 
-                self.assertTrue(found_buffer)
-                model_buffers.pop(i)
+    def test_compile_tf_model(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        max_input = getattr(self.model_tester, "max_position_embeddings", 512)
+        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
+        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+        metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
 
-            models_equal = True
-            for layer_name, p1 in model_state_dict.items():
-                if layer_name in loaded_model_state_dict:
-                    p2 = loaded_model_state_dict[layer_name]
-                    if p1.data.ne(p2.data).sum() > 0:
-                        models_equal = False
+        for model_class in self.all_model_classes:
+            if model_class.__name__ in [
+                "TFSpeech2TextModel",
+                "TFSpeech2TextForConditionalGeneration",
+            ]:
+                inputs = {
+                    "decoder_input_ids": tf.keras.Input(
+                        batch_shape=(2, max_input),
+                        name="decoder_input_ids",
+                        dtype="int32",
+                    ),
+                    "input_features": tf.keras.Input(
+                        batch_shape=(
+                            2,
+                            max_input,
+                            self.model_tester.input_feat_per_channel * self.model_tester.input_channels,
+                        ),
+                        name="input_features",
+                        dtype="float32",
+                    ),
+                }
+            elif self.is_encoder_decoder:
+                inputs = {
+                    "decoder_input_ids": tf.keras.Input(
+                        batch_shape=(2, max_input),
+                        name="decoder_input_ids",
+                        dtype="int32",
+                    ),
+                    "input_ids": tf.keras.Input(
+                        batch_shape=(2, max_input),
+                        name="input_ids",
+                        dtype="int32",
+                    ),
+                }
+            # TODO: A better way to handle vision models
+            elif model_class.__name__ in [
+                "TFViTModel",
+                "TFViTForImageClassification",
+                "TFCLIPVisionModel",
+            ]:
+                inputs = tf.keras.Input(
+                    batch_shape=(
+                        3,
+                        self.model_tester.num_channels,
+                        self.model_tester.image_size,
+                        self.model_tester.image_size,
+                    ),
+                    name="pixel_values",
+                    dtype="float32",
+                )
+            elif model_class.__name__ in ["TFCLIPModel"]:
+                inputs = {
+                    "input_ids": tf.keras.Input(
+                        batch_shape=(3, max_input),
+                        name="input_ids",
+                        dtype="int32",
+                    ),
+                    "pixel_values": tf.keras.Input(
+                        batch_shape=(
+                            3,
+                            self.model_tester.vision_model_tester.num_channels,
+                            self.model_tester.vision_model_tester.image_size,
+                            self.model_tester.vision_model_tester.image_size,
+                        ),
+                        name="pixel_values",
+                        dtype="float32",
+                    ),
+                }
+            elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
+                inputs = tf.keras.Input(
+                    batch_shape=(4, 2, max_input),
+                    name="input_ids",
+                    dtype="int32",
+                )
+            else:
+                inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
 
-            self.assertTrue(models_equal)
+            # Prepare our model
+            model = model_class(config)
+            model(self._prepare_for_class(inputs_dict, model_class))  # Model must be called before saving.
+            # Let's load it from the disk to be sure we can use pretrained weights
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname, saved_model=False)
+                model = model_class.from_pretrained(tmpdirname)
 
-    def test_torch_fx(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        self._create_and_check_torch_fx_tracing(config, inputs_dict)
+            outputs_dict = model(inputs)
+            hidden_states = outputs_dict[0]
 
-    def test_torch_fx_output_loss(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        self._create_and_check_torch_fx_tracing(config, inputs_dict, output_loss=True)
+            # Add a dense layer on top to test integration with other keras modules
+            outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
 
-    def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
-        if not is_torch_fx_available() or not self.fx_compatible:
-            return
+            # Compile extended model
+            extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs])
+            extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        configs_no_init.return_dict = False
+    def test_keyword_and_dict_args(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss)
+            model = model_class(config)
+            inputs = self._prepare_for_class(inputs_dict, model_class)
 
-            try:
-                if model.config.is_encoder_decoder:
-                    model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
-                    labels = inputs.get("labels", None)
-                    input_names = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"]
-                    if labels is not None:
-                        input_names.append("labels")
-                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
-
-                    model_output = model(**filtered_inputs)
-
-                    traced_model = symbolic_trace(model, input_names)
-                    traced_output = traced_model(**filtered_inputs)
-                else:
-                    input_names = ["input_ids", "attention_mask", "token_type_ids"]
-                    input_ids = inputs["input_ids"]
-
-                    labels = inputs.get("labels", None)
-                    start_positions = inputs.get("start_positions", None)
-                    end_positions = inputs.get("end_positions", None)
-                    if labels is not None:
-                        input_names.append("labels")
-                    if start_positions is not None:
-                        input_names.append("start_positions")
-                    if end_positions is not None:
-                        input_names.append("end_positions")
-
-                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
-                    input_names = filtered_inputs.keys()
-
-                    model_output = model(**filtered_inputs)
-
-                    rank = len(input_ids.shape)
-                    if rank not in [2, 3]:
-                        raise NotImplementedError(
-                            f"symbolic_trace automatic parameters inference not implemented for input of rank {rank}."
-                        )
+            outputs_dict = model(inputs)
 
-                    traced_model = symbolic_trace(model, input_names)
-                    traced_output = traced_model(**filtered_inputs)
+            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
+            outputs_keywords = model(**inputs_keywords)
+            output_dict = outputs_dict[0].numpy()
+            output_keywords = outputs_keywords[0].numpy()
 
-            except RuntimeError:
-                self.fail("Couldn't trace module.")
+            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
 
-            def flatten_output(output):
-                flatten = []
-                for x in output:
-                    if isinstance(x, (tuple, list)):
-                        flatten += flatten_output(x)
-                    elif not isinstance(x, torch.Tensor):
-                        continue
-                    else:
-                        flatten.append(x)
-                return flatten
+    def test_attention_outputs(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+        decoder_seq_length = getattr(
+            self.model_tester,
+            "decoder_seq_length",
+            self.model_tester.seq_length,
+        )
+        encoder_seq_length = getattr(
+            self.model_tester,
+            "encoder_seq_length",
+            self.model_tester.seq_length,
+        )
+        decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
+        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
 
-            model_output = flatten_output(model_output)
-            traced_output = flatten_output(traced_output)
-            num_outputs = len(model_output)
+        def check_decoder_attentions_output(outputs):
+            out_len = len(outputs)
+            self.assertEqual(min(out_len % 2, out_len % 5), 0)  # differentiation due to newly added cross_attentions
+            decoder_attentions = outputs.decoder_attentions
+            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(decoder_attentions[0].shape[-3:]),
+                [
+                    self.model_tester.num_attention_heads,
+                    decoder_seq_length,
+                    decoder_key_length,
+                ],
+            )
 
-            for i in range(num_outputs):
-                self.assertTrue(
-                    torch.allclose(model_output[i], traced_output[i]),
-                    f"traced {i}th output doesn't match model {i}th output for {model_class}",
-                )
+        def check_encoder_attentions_output(outputs):
+            attentions = [
+                t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions)
+            ]
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [
+                    self.model_tester.num_attention_heads,
+                    encoder_seq_length,
+                    encoder_key_length,
+                ],
+            )
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_attentions"] = True
+            inputs_dict["use_cache"] = False
+            config.output_hidden_states = False
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            out_len = len(outputs)
+            self.assertEqual(config.output_hidden_states, False)
+            check_encoder_attentions_output(outputs)
+
+            if self.is_encoder_decoder:
+                model = model_class(config)
+                outputs = model(self._prepare_for_class(inputs_dict, model_class))
+                self.assertEqual(config.output_hidden_states, False)
+                check_decoder_attentions_output(outputs)
+
+            # Check that output attentions can also be changed via the config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            self.assertEqual(config.output_hidden_states, False)
+            check_encoder_attentions_output(outputs)
+
+            # Check attention is always last and order is fine
+            inputs_dict["output_attentions"] = True
+            config.output_hidden_states = True
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+
+            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
+            self.assertEqual(model.config.output_hidden_states, True)
+            check_encoder_attentions_output(outputs)
 
     def test_headmasking(self):
         if not self.test_head_masking:
             return
 
-        global_rng.seed(42)
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        global_rng.seed()
+        random.Random().seed(42)
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        random.Random().seed()
 
         inputs_dict["output_attentions"] = True
         config.output_hidden_states = True
         configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
         for model_class in self.all_model_classes:
             model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
 
             # Prepare head_mask
-            # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-            head_mask = torch.ones(
-                self.model_tester.num_hidden_layers,
-                self.model_tester.num_attention_heads,
-                device=torch_device,
+            def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
+                if i == 0:
+                    return tf.concat(
+                        (
+                            tf.zeros(1, dtype=tf.float32),
+                            tf.ones(attention_heads - 1, dtype=tf.float32),
+                        ),
+                        0,
+                    )
+                elif i == num_hidden_layers - 1:
+                    return tf.concat(
+                        (
+                            tf.zeros(attention_heads - 1, dtype=tf.float32),
+                            tf.ones(1, dtype=tf.float32),
+                        ),
+                        0,
+                    )
+                else:
+                    return tf.ones(attention_heads, dtype=tf.float32)
+
+            head_mask = tf.stack(
+                [
+                    prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers)
+                    for i in range(config.num_hidden_layers)
+                ],
+                0,
             )
-            head_mask[0, 0] = 0
-            head_mask[-1, :-1] = 0
-            head_mask.requires_grad_(requires_grad=True)
+
             inputs = self._prepare_for_class(inputs_dict, model_class).copy()
             inputs["head_mask"] = head_mask
             if model.config.is_encoder_decoder:
-                signature = inspect.signature(model.forward)
+                signature = inspect.signature(model.call)
                 arg_names = [*signature.parameters.keys()]
                 if "decoder_head_mask" in arg_names:  # necessary diferentiation because of T5 model
                     inputs["decoder_head_mask"] = head_mask
                 if "cross_attn_head_mask" in arg_names:
                     inputs["cross_attn_head_mask"] = head_mask
-            outputs = model(**inputs, return_dict=True)
-
-            # Test that we can get a gradient back for importance score computation
-            output = sum(t.sum() for t in outputs[0])
-            output = output.sum()
-            output.backward()
-            multihead_outputs = head_mask.grad
 
-            self.assertIsNotNone(multihead_outputs)
-            self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
+            outputs = model(**inputs, return_dict=True)
 
             def check_attentions_validity(attentions):
                 # Remove Nan
                 for t in attentions:
                     self.assertLess(
-                        torch.sum(torch.isnan(t)), t.numel() / 4
+                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(),
+                        (tf.size(t) / 4).numpy(),
                     )  # Check we don't have more than 25% nans (arbitrary)
+
                 attentions = [
-                    t.masked_fill(torch.isnan(t), 0.0) for t in attentions
+                    tf.where(tf.math.is_nan(t), 0.0, t) for t in attentions
                 ]  # remove them (the test is less complete)
 
-                self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
-                self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
-                if len(attentions) > 2:  # encoder-decoder models have only 2 layers in each module
-                    self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
-                self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
-                self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
+                self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
+                self.assertNotEqual(
+                    tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(),
+                    0.0,
+                )
+                if len(attentions) > 2:  # encoder-decodere models have only 2 layers in each modules
+                    self.assertNotEqual(
+                        tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(),
+                        0.0,
+                    )
+                self.assertAlmostEqual(
+                    tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(),
+                    0.0,
+                )
+                self.assertNotEqual(
+                    tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(),
+                    0.0,
+                )
 
             if model.config.is_encoder_decoder:
                 check_attentions_validity(outputs.encoder_attentions)
                 check_attentions_validity(outputs.decoder_attentions)
-                check_attentions_validity(outputs.cross_attentions)
+                if "cross_attn_head_mask" in arg_names:
+                    check_attentions_validity(outputs.cross_attentions)
             else:
                 check_attentions_validity(outputs.attentions)
 
-    def test_head_pruning(self):
-        if not self.test_pruning:
-            return
-
-        for model_class in self.all_model_classes:
-            (
-                config,
-                inputs_dict,
-            ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-            if "head_mask" in inputs_dict:
-                del inputs_dict["head_mask"]
-
-            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = False
-            model = model_class(config=config)
-            model.to(torch_device)
-            model.eval()
-            heads_to_prune = {
-                0: list(range(1, self.model_tester.num_attention_heads)),
-                -1: [0],
-            }
-            model.prune_heads(heads_to_prune)
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            attentions = outputs[-1]
-
-            self.assertEqual(attentions[0].shape[-3], 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
-
-    def test_head_pruning_save_load_from_pretrained(self):
-        if not self.test_pruning:
-            return
-
-        for model_class in self.all_model_classes:
-            (
-                config,
-                inputs_dict,
-            ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-            if "head_mask" in inputs_dict:
-                del inputs_dict["head_mask"]
-
-            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = False
-            model = model_class(config=config)
-            model.to(torch_device)
-            model.eval()
-            heads_to_prune = {
-                0: list(range(1, self.model_tester.num_attention_heads)),
-                -1: [0],
-            }
-            model.prune_heads(heads_to_prune)
-
-            with tempfile.TemporaryDirectory() as temp_dir_name:
-                model.save_pretrained(temp_dir_name)
-                model = model_class.from_pretrained(temp_dir_name)
-                model.to(torch_device)
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs[-1]
-            self.assertEqual(attentions[0].shape[-3], 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
-
-    def test_head_pruning_save_load_from_config_init(self):
-        if not self.test_pruning:
-            return
-
-        for model_class in self.all_model_classes:
-            (
-                config,
-                inputs_dict,
-            ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-            if "head_mask" in inputs_dict:
-                del inputs_dict["head_mask"]
-
-            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = False
-
-            heads_to_prune = {
-                0: list(range(1, self.model_tester.num_attention_heads)),
-                -1: [0],
-            }
-            config.pruned_heads = heads_to_prune
-
-            model = model_class(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs[-1]
-
-            self.assertEqual(attentions[0].shape[-3], 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
-
-    def test_head_pruning_integration(self):
-        if not self.test_pruning:
-            return
-
-        for model_class in self.all_model_classes:
-            (
-                config,
-                inputs_dict,
-            ) = self.model_tester.prepare_config_and_inputs_for_common()
-
-            if "head_mask" in inputs_dict:
-                del inputs_dict["head_mask"]
-
-            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = False
-
-            heads_to_prune = {0: [0], 1: [1, 2]}
-            config.pruned_heads = heads_to_prune
-
-            model = model_class(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs[-1]
-
-            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-
-            with tempfile.TemporaryDirectory() as temp_dir_name:
-                model.save_pretrained(temp_dir_name)
-                model = model_class.from_pretrained(temp_dir_name)
-                model.to(torch_device)
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs[-1]
-
-            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-
-            heads_to_prune = {0: [0], 2: [1, 2]}
-            model.prune_heads(heads_to_prune)
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs[-1]
-
-            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-
-            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
-
     def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
+        def check_hidden_states_output(config, inputs_dict, model_class):
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
             expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            if hasattr(self.model_tester, "encoder_seq_length"):
-                seq_length = self.model_tester.encoder_seq_length
-                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
-                    seq_length = seq_length * self.model_tester.chunk_length
-            else:
-                seq_length = self.model_tester.seq_length
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
+                self.model_tester,
+                "expected_num_hidden_layers",
+                self.model_tester.num_hidden_layers + 1,
             )
 
-            if config.is_encoder_decoder:
-                hidden_states = outputs.decoder_hidden_states
+            if model.config.is_encoder_decoder:
+                encoder_hidden_states = outputs.encoder_hidden_states
+                decoder_hidden_states = outputs.decoder_hidden_states
 
-                self.assertIsInstance(hidden_states, (list, tuple))
+                self.assertEqual(config.output_attentions, False)
+                self.assertEqual(len(encoder_hidden_states), expected_num_layers)
+                self.assertListEqual(
+                    list(encoder_hidden_states[0].shape[-2:]),
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
+                )
+                self.assertEqual(len(decoder_hidden_states), expected_num_layers)
+                self.assertListEqual(
+                    list(decoder_hidden_states[0].shape[-2:]),
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
+                )
+            else:
+                hidden_states = outputs.hidden_states
+                self.assertEqual(config.output_attentions, False)
                 self.assertEqual(len(hidden_states), expected_num_layers)
-                seq_len = getattr(self.model_tester, "seq_length", None)
-                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [decoder_seq_length, self.model_tester.hidden_size],
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
                 )
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
         for model_class in self.all_model_classes:
             inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
+            check_hidden_states_output(config, inputs_dict, model_class)
 
-            # check that output_hidden_states also work using config
             del inputs_dict["output_hidden_states"]
             config.output_hidden_states = True
+            check_hidden_states_output(config, inputs_dict, model_class)
 
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_retain_grad_hidden_states_attentions(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = True
-
-        # no need to test all models as different heads yield the same functionality
-        model_class = self.all_model_classes[0]
-        model = model_class(config)
-        model.to(torch_device)
-
-        inputs = self._prepare_for_class(inputs_dict, model_class)
-
-        outputs = model(**inputs)
-
-        output = outputs[0]
-
-        if config.is_encoder_decoder:
-            # Seq2Seq models
-            encoder_hidden_states = outputs.encoder_hidden_states[0]
-            encoder_attentions = outputs.encoder_attentions[0]
-            encoder_hidden_states.retain_grad()
-            encoder_attentions.retain_grad()
-
-            decoder_hidden_states = outputs.decoder_hidden_states[0]
-            decoder_attentions = outputs.decoder_attentions[0]
-            decoder_hidden_states.retain_grad()
-            decoder_attentions.retain_grad()
-
-            cross_attentions = outputs.cross_attentions[0]
-            cross_attentions.retain_grad()
-
-            output.flatten()[0].backward(retain_graph=True)
-
-            self.assertIsNotNone(encoder_hidden_states.grad)
-            self.assertIsNotNone(encoder_attentions.grad)
-            self.assertIsNotNone(decoder_hidden_states.grad)
-            self.assertIsNotNone(decoder_attentions.grad)
-            self.assertIsNotNone(cross_attentions.grad)
-        else:
-            # Encoder-/Decoder-only models
-            hidden_states = outputs.hidden_states[0]
-            attentions = outputs.attentions[0]
-
-            hidden_states.retain_grad()
-            attentions.retain_grad()
-
-            output.flatten()[0].backward(retain_graph=True)
-
-            self.assertIsNotNone(hidden_states.grad)
-            self.assertIsNotNone(attentions.grad)
-
-    def test_feed_forward_chunking(self):
-        (
-            original_config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            torch.manual_seed(0)
-            config = copy.deepcopy(original_config)
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            hidden_states_no_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0]
-
-            torch.manual_seed(0)
-            config.chunk_size_feed_forward = 1
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            hidden_states_with_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0]
-            self.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3))
-
-    def test_resize_position_vector_embeddings(self):
-        if not self.test_resize_position_embeddings:
-            return
-
+    def test_model_common_attributes(self):
         (
-            original_config,
+            config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
+        text_in_text_out_models = (
+            get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
+            + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
+            + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING)
+        )
+        speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING)
 
         for model_class in self.all_model_classes:
-            config = copy.deepcopy(original_config)
             model = model_class(config)
-            model.to(torch_device)
-
-            if self.model_tester.is_training is False:
-                model.eval()
-
-            max_position_embeddings = config.max_position_embeddings
-
-            # Retrieve the embeddings and clone theme
-            if model.config.is_encoder_decoder:
-                encoder_model_embed, decoder_model_embed = model.get_position_embeddings()
-                encoder_cloned_embeddings = encoder_model_embed.weight.clone()
-                decoder_cloned_embeddings = decoder_model_embed.weight.clone()
-            else:
-                model_embed = model.get_position_embeddings()
-                cloned_embeddings = model_embed.weight.clone()
-
-            # Check that resizing the position embeddings with a larger max_position_embeddings increases
-            # the model's postion embeddings size
-            model.resize_position_embeddings(max_position_embeddings + 10)
-            self.assertEqual(model.config.max_position_embeddings, max_position_embeddings + 10)
-
-            # Check that it actually resizes the embeddings matrix
-            if model.config.is_encoder_decoder:
-                encoder_model_embed, decoder_model_embed = model.get_position_embeddings()
-                self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] + 10)
-                self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] + 10)
-            else:
-                model_embed = model.get_position_embeddings()
-                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
-
-            # Check that the model can still do a forward pass successfully (every parameter should be resized)
-            model(**self._prepare_for_class(inputs_dict, model_class))
-
-            # Check that resizing the position embeddings with a smaller max_position_embeddings decreases
-            # the model's max_position_embeddings
-            model.resize_position_embeddings(max_position_embeddings - 5)
-            self.assertEqual(model.config.max_position_embeddings, max_position_embeddings - 5)
-
-            # Check that it actually resizes the embeddings matrix
-            if model.config.is_encoder_decoder:
-                encoder_model_embed, decoder_model_embed = model.get_position_embeddings()
-                self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] - 5)
-                self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] - 5)
-            else:
-                model_embed = model.get_position_embeddings()
-                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5)
-
-            # Check that the model can still do a forward pass successfully (every parameter should be resized)
-            model(**self._prepare_for_class(inputs_dict, model_class))
-
-            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
-            models_equal = True
-
-            if model.config.is_encoder_decoder:
-                for p1, p2 in zip(encoder_cloned_embeddings, encoder_model_embed.weight):
-                    if p1.data.ne(p2.data).sum() > 0:
-                        models_equal = False
-                for p1, p2 in zip(decoder_cloned_embeddings, decoder_model_embed.weight):
-                    if p1.data.ne(p2.data).sum() > 0:
-                        models_equal = False
+            assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
+            if model_class in text_in_text_out_models:
+                x = model.get_output_embeddings()
+                assert isinstance(x, tf.keras.layers.Layer)
+                name = model.get_bias()
+                assert isinstance(name, dict)
+                for k, v in name.items():
+                    assert isinstance(v, tf.Variable)
+            elif model_class in speech_in_text_out_models:
+                x = model.get_output_embeddings()
+                assert isinstance(x, tf.keras.layers.Layer)
+                name = model.get_bias()
+                assert name is None
             else:
-                for p1, p2 in zip(cloned_embeddings, model_embed.weight):
-                    if p1.data.ne(p2.data).sum() > 0:
-                        models_equal = False
+                x = model.get_output_embeddings()
+                assert x is None
+                name = model.get_bias()
+                assert name is None
 
-            self.assertTrue(models_equal)
-
-    def test_resize_tokens_embeddings(self):
+    def test_determinism(self):
         (
-            original_config,
+            config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        if not self.test_resize_embeddings:
-            return
 
         for model_class in self.all_model_classes:
-            config = copy.deepcopy(original_config)
             model = model_class(config)
-            model.to(torch_device)
-
-            if self.model_tester.is_training is False:
-                model.eval()
-
-            model_vocab_size = config.vocab_size
-            # Retrieve the embeddings and clone theme
-            model_embed = model.resize_token_embeddings(model_vocab_size)
-            cloned_embeddings = model_embed.weight.clone()
-
-            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
-            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
-            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
-            # Check that it actually resizes the embeddings matrix
-            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
-            # Check that the model can still do a forward pass successfully (every parameter should be resized)
-            model(**self._prepare_for_class(inputs_dict, model_class))
-
-            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
-            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
-            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
-            # Check that it actually resizes the embeddings matrix
-            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
-
-            # Check that the model can still do a forward pass successfully (every parameter should be resized)
-            # Input ids should be clamped to the maximum size of the vocabulary
-            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
-
-            # make sure that decoder_input_ids are resized as well
-            if "decoder_input_ids" in inputs_dict:
-                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
-            model(**self._prepare_for_class(inputs_dict, model_class))
-
-            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
-            models_equal = True
-            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
-                if p1.data.ne(p2.data).sum() > 0:
-                    models_equal = False
+            first, second = (
+                model(
+                    self._prepare_for_class(inputs_dict, model_class),
+                    training=False,
+                )[0],
+                model(
+                    self._prepare_for_class(inputs_dict, model_class),
+                    training=False,
+                )[0],
+            )
+            out_1 = first.numpy()
+            out_2 = second.numpy()
+            out_1 = out_1[~np.isnan(out_1)]
+            out_2 = out_2[~np.isnan(out_2)]
+            max_diff = np.amax(np.abs(out_1 - out_2))
+            self.assertLessEqual(max_diff, 1e-5)
 
-            self.assertTrue(models_equal)
+    def test_model_outputs_equivalence(self):
 
-    def test_resize_embeddings_untied(self):
         (
-            original_config,
+            config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        if not self.test_resize_embeddings:
-            return
-
-        original_config.tie_word_embeddings = False
-
-        # if model cannot untied embeddings -> leave test
-        if original_config.tie_word_embeddings:
-            return
-
-        for model_class in self.all_model_classes:
-            config = copy.deepcopy(original_config)
-            model = model_class(config).to(torch_device)
-
-            # if no output embeddings -> leave test
-            if model.get_output_embeddings() is None:
-                continue
-
-            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
-            model_vocab_size = config.vocab_size
-            model.resize_token_embeddings(model_vocab_size + 10)
-            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
-            output_embeds = model.get_output_embeddings()
-            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
-            # Check bias if present
-            if output_embeds.bias is not None:
-                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
-            # Check that the model can still do a forward pass successfully (every parameter should be resized)
-            model(**self._prepare_for_class(inputs_dict, model_class))
-
-            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
-            model.resize_token_embeddings(model_vocab_size - 15)
-            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
-            # Check that it actually resizes the embeddings matrix
-            output_embeds = model.get_output_embeddings()
-            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
-            # Check bias if present
-            if output_embeds.bias is not None:
-                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
-            # Check that the model can still do a forward pass successfully (every parameter should be resized)
-            # Input ids should be clamped to the maximum size of the vocabulary
-            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
-            if "decoder_input_ids" in inputs_dict:
-                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
-            # Check that the model can still do a forward pass successfully (every parameter should be resized)
-            model(**self._prepare_for_class(inputs_dict, model_class))
-
-    def test_model_common_attributes(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding, AdaptiveEmbedding))
-            model.set_input_embeddings(nn.Embedding(10, 10))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, nn.Linear))
-
-    def test_model_main_input_name(self):
-        for model_class in self.all_model_classes:
-            model_signature = inspect.signature(getattr(model_class, "forward"))
-            # The main input is the name of the argument after `self`
-            observed_main_input_name = list(model_signature.parameters.keys())[1]
-            self.assertEqual(model_class.main_input_name, observed_main_input_name)
-
-    def test_correct_missing_keys(self):
-        if not self.test_missing_keys:
-            return
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            base_model_prefix = model.base_model_prefix
-
-            if hasattr(model, base_model_prefix):
-                with tempfile.TemporaryDirectory() as temp_dir_name:
-                    model.base_model.save_pretrained(temp_dir_name)
-                    model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True)
-                    with self.subTest(msg=f"Missing keys for {model.__class__.__name__}"):
-                        self.assertGreater(len(loading_info["missing_keys"]), 0)
-
-    def test_tie_model_weights(self):
-        if not self.test_torchscript:
-            return
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_same_values(layer_1, layer_2):
-            equal = True
-            for p1, p2 in zip(layer_1.weight, layer_2.weight):
-                if p1.data.ne(p2.data).sum() > 0:
-                    equal = False
-            return equal
-
-        for model_class in self.all_model_classes:
-            config.torchscript = True
-            model_not_tied = model_class(config)
-            if model_not_tied.get_output_embeddings() is None:
-                continue
-
-            config_tied = copy.deepcopy(config)
-            config_tied.torchscript = False
-            model_tied = model_class(config_tied)
-            params_tied = list(model_tied.parameters())
-            # Check that the embedding layer and decoding layer are the same in size and in value
-            # self.assertTrue(check_same_values(embeddings, decoding))
-
-            # # Check that after modification, they remain the same.
-            # embeddings.weight.data.div_(2)
-            # # Check that the embedding layer and decoding layer are the same in size and in value
-            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
-            # self.assertTrue(check_same_values(embeddings, decoding))
-
-            # # Check that after modification, they remain the same.
-            # decoding.weight.data.div_(4)
-            # # Check that the embedding layer and decoding layer are the same in size and in value
-            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
-            # self.assertTrue(check_same_values(embeddings, decoding))
-
-            # Check that after resize they remain tied.
-            model_tied.resize_token_embeddings(config.vocab_size + 10)
-            params_tied_2 = list(model_tied.parameters())
-            self.assertEqual(len(params_tied_2), len(params_tied))
-
-            # decoding.weight.data.mul_(20)
-            # # Check that the embedding layer and decoding layer are the same in size and in value
-            # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
-            # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
-
-    def test_model_outputs_equivalence(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def set_nan_tensor_to_zero(t):
-            t[t != t] = 0
-            return t
-
-        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
-            with torch.no_grad():
-                tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
-                dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
-
-                def recursive_check(tuple_object, dict_object):
-                    if isinstance(tuple_object, (List, Tuple)):
-                        for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
-                            recursive_check(tuple_iterable_value, dict_iterable_value)
-                    elif isinstance(tuple_object, Dict):
-                        for tuple_iterable_value, dict_iterable_value in zip(
-                            tuple_object.values(), dict_object.values()
-                        ):
-                            recursive_check(tuple_iterable_value, dict_iterable_value)
-                    elif tuple_object is None:
-                        return
-                    else:
-                        self.assertTrue(
-                            torch.allclose(
-                                set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
-                            ),
-                            msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.",
-                        )
+
+        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
+            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
+            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
+
+            def recursive_check(tuple_object, dict_object):
+                if isinstance(tuple_object, (List, Tuple)):
+                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
+                        recursive_check(tuple_iterable_value, dict_iterable_value)
+                elif tuple_object is None:
+                    return
+                else:
+                    self.assertTrue(
+                        all(tf.equal(tuple_object, dict_object)),
+                        msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}",
+                    )
 
                 recursive_check(tuple_output, dict_output)
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            model.to(torch_device)
-            model.eval()
 
             tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
             dict_inputs = self._prepare_for_class(inputs_dict, model_class)
@@ -1436,262 +943,22 @@ def recursive_check(tuple_object, dict_object):
             tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             check_equivalence(
-                model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
+                model,
+                tuple_inputs,
+                dict_inputs,
+                {"output_hidden_states": True, "output_attentions": True},
             )
 
-    @is_pt_tf_cross_test
-    def test_pt_tf_model_equivalence(self):
-        import numpy as np
-        import tensorflow as tf
-
-        import transformers
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            tf_model_class_name = "TF" + model_class.__name__  # Add the "TF" at the beginning
-
-            if not hasattr(transformers, tf_model_class_name):
-                # transformers does not have TF version yet
-                return
-
-            tf_model_class = getattr(transformers, tf_model_class_name)
-
-            config.output_hidden_states = True
-
-            tf_model = tf_model_class(config)
-            pt_model = model_class(config)
-
-            # make sure only tf inputs are forward that actually exist in function args
-            tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys())
-
-            # remove all head masks
-            tf_input_keys.discard("head_mask")
-            tf_input_keys.discard("cross_attn_head_mask")
-            tf_input_keys.discard("decoder_head_mask")
-
-            pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-            pt_inputs = {k: v for k, v in pt_inputs.items() if k in tf_input_keys}
-
-            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-            pt_model.eval()
-            tf_inputs_dict = {}
-            for key, tensor in pt_inputs.items():
-                # skip key that does not exist in tf
-                if type(tensor) == bool:
-                    tf_inputs_dict[key] = tensor
-                elif key == "input_values":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                elif key == "pixel_values":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                elif key == "input_features":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                else:
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32)
-
-            # Check we can load pt model in tf and vice-versa with model => model functions
-            tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict)
-            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model).to(torch_device)
-
-            # need to rename encoder-decoder "inputs" for PyTorch
-            #            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
-            #                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
-
-            with torch.no_grad():
-                pto = pt_model(**pt_inputs)
-            tfo = tf_model(tf_inputs_dict, training=False)
-
-            tf_hidden_states = tfo[0].numpy()
-            pt_hidden_states = pto[0].cpu().numpy()
-
-            tf_nans = np.copy(np.isnan(tf_hidden_states))
-            pt_nans = np.copy(np.isnan(pt_hidden_states))
-
-            pt_hidden_states[tf_nans] = 0
-            tf_hidden_states[tf_nans] = 0
-            pt_hidden_states[pt_nans] = 0
-            tf_hidden_states[pt_nans] = 0
-
-            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
-            self.assertLessEqual(max_diff, 4e-2)
-
-            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
-                torch.save(pt_model.state_dict(), pt_checkpoint_path)
-                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
-
-                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
-                tf_model.save_weights(tf_checkpoint_path)
-                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
-                pt_model = pt_model.to(torch_device)
-
-            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-            pt_model.eval()
-            tf_inputs_dict = {}
-            for key, tensor in pt_inputs.items():
-                # skip key that does not exist in tf
-                if type(tensor) == bool:
-                    tensor = np.array(tensor, dtype=bool)
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32)
-                elif key == "input_values":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                elif key == "pixel_values":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                elif key == "input_features":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                else:
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32)
-
-            # need to rename encoder-decoder "inputs" for PyTorch
-            #            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
-            #                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
-
-            with torch.no_grad():
-                pto = pt_model(**pt_inputs)
-
-            tfo = tf_model(tf_inputs_dict)
-            tfo = tfo[0].numpy()
-            pto = pto[0].cpu().numpy()
-            tf_nans = np.copy(np.isnan(tfo))
-            pt_nans = np.copy(np.isnan(pto))
-
-            pto[tf_nans] = 0
-            tfo[tf_nans] = 0
-            pto[pt_nans] = 0
-            tfo[pt_nans] = 0
-
-            max_diff = np.amax(np.abs(tfo - pto))
-            self.assertLessEqual(max_diff, 4e-2)
-
-    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
-        diff = np.abs((a - b)).max()
-        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")
-
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-
-                # load PyTorch class
-                pt_model = model_class(config).eval()
-                # Flax models don't use the `use_cache` option and cache is not returned as a default.
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-
-                fx_model_class_name = "Flax" + model_class.__name__
-
-                if not hasattr(transformers, fx_model_class_name):
-                    return
-
-                fx_model_class = getattr(transformers, fx_model_class_name)
-
-                # load Flax class
-                fx_model = fx_model_class(config, dtype=jnp.float32)
-                # make sure only flax inputs are forward that actually exist in function args
-                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
-
-                # prepare inputs
-                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-                # remove function args that don't exist in Flax
-                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                # convert inputs to Flax
-                fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)}
-                fx_outputs = fx_model(**fx_inputs).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-                    self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2)
-
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # load corresponding PyTorch class
-                pt_model = model_class(config).eval()
-
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-
-                fx_model_class_name = "Flax" + model_class.__name__
-
-                if not hasattr(transformers, fx_model_class_name):
-                    # no flax model exists for this class
-                    return
-
-                fx_model_class = getattr(transformers, fx_model_class_name)
-
-                # load Flax class
-                fx_model = fx_model_class(config, dtype=jnp.float32)
-                # make sure only flax inputs are forward that actually exist in function args
-                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                # prepare inputs
-                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-                # remove function args that don't exist in Flax
-                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)}
-
-                fx_outputs = fx_model(**fx_inputs).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                self.assertEqual(
-                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
-
     def test_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            model.to(torch_device)
-            model.eval()
 
-            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
+            inputs = copy.deepcopy(inputs_dict)
 
             if not self.is_encoder_decoder:
                 input_ids = inputs["input_ids"]
@@ -1702,261 +969,511 @@ def test_inputs_embeds(self):
                 del inputs["input_ids"]
                 inputs.pop("decoder_input_ids", None)
 
-            wte = model.get_input_embeddings()
             if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = wte(input_ids)
+                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
             else:
-                inputs["inputs_embeds"] = wte(encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
+                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
+                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)
 
-            with torch.no_grad():
-                model(**inputs)[0]
+            inputs = self._prepare_for_class(inputs, model_class)
+
+            model(inputs)
 
-    @require_torch_multi_gpu
-    def test_multi_gpu_data_parallel_forward(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_numpy_arrays_inputs(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-        # some params shouldn't be scattered by nn.DataParallel
-        # so just remove them if they are present.
-        blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"]
-        for k in blacklist_non_batched_params:
-            inputs_dict.pop(k, None)
+        def prepare_numpy_arrays(inputs_dict):
+            inputs_np_dict = {}
+            for k, v in inputs_dict.items():
+                if tf.is_tensor(v):
+                    inputs_np_dict[k] = v.numpy()
+                else:
+                    inputs_np_dict[k] = np.array(k)
 
-        # move input tensors to cuda:O
-        for k, v in inputs_dict.items():
-            if torch.is_tensor(v):
-                inputs_dict[k] = v.to(0)
+            return inputs_np_dict
 
         for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            model.to(0)
-            model.eval()
+            model = model_class(config)
 
-            # Wrap model in nn.DataParallel
-            model = nn.DataParallel(model)
-            with torch.no_grad():
-                _ = model(**self._prepare_for_class(inputs_dict, model_class))
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+            inputs_np = prepare_numpy_arrays(inputs)
 
-    @require_torch_multi_gpu
-    def test_model_parallelization(self):
-        if not self.test_model_parallel:
-            return
+            output_for_dict_input = model(inputs_np)
+            output_for_kw_input = model(**inputs_np)
+            self.assert_outputs_same(output_for_dict_input, output_for_kw_input)
 
-        # a candidate for testing_utils
-        def get_current_gpu_memory_use():
-            """returns a list of cuda memory allocations per GPU in MBs"""
+    def test_resize_token_embeddings(self):
+        if not self.test_resize_embeddings:
+            return
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-            per_device_memory = []
-            for id in range(torch.cuda.device_count()):
-                with torch.cuda.device(id):
-                    per_device_memory.append(torch.cuda.memory_allocated() >> 20)
+        def _get_word_embedding_weight(model, embedding_layer):
+            embeds = getattr(embedding_layer, "weight", None)
+            if embeds is not None:
+                return embeds
 
-            return per_device_memory
+            embeds = getattr(embedding_layer, "decoder", None)
+            if embeds is not None:
+                return embeds
 
-        # Needs a large model to see the difference.
-        config = self.model_tester.get_large_model_config()
+            model(model.dummy_inputs)
 
-        for model_class in self.all_parallelizable_model_classes:
-            torch.cuda.empty_cache()
+            embeds = getattr(embedding_layer, "weight", None)
+            if embeds is not None:
+                return embeds
 
-            # 1. single gpu memory load + unload + memory measurements
-            # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests)
-            memory_at_start = get_current_gpu_memory_use()
+            embeds = getattr(embedding_layer, "decoder", None)
+            if embeds is not None:
+                return embeds
 
-            # Put model on device 0 and take a memory snapshot
-            model = model_class(config)
-            model.to("cuda:0")
-            memory_after_model_load = get_current_gpu_memory_use()
+            return None
 
-            # The memory use on device 0 should be higher than it was initially.
-            self.assertGreater(memory_after_model_load[0], memory_at_start[0])
+        for model_class in self.all_model_classes:
+            for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
+                # build the embeddings
+                model = model_class(config=config)
+                old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
+                old_bias = model.get_bias()
+                old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
+                # reshape the embeddings
+                model.resize_token_embeddings(size)
+                new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
+                new_bias = model.get_bias()
+                new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
+
+                # check that the resized embeddings size matches the desired size.
+                assert_size = size if size is not None else config.vocab_size
+                self.assertEqual(new_input_embeddings.shape[0], assert_size)
+
+                # check that weights remain the same after resizing
+                models_equal = True
+                for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()):
+                    if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                        models_equal = False
+                self.assertTrue(models_equal)
+
+                if old_bias is not None and new_bias is not None:
+                    for old_weight, new_weight in zip(old_bias.values(), new_bias.values()):
+                        self.assertEqual(new_weight.shape[0], assert_size)
+
+                        models_equal = True
+                        for p1, p2 in zip(old_weight.value(), new_weight.value()):
+                            if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                                models_equal = False
+                        self.assertTrue(models_equal)
+
+                if old_output_embeddings is not None and new_output_embeddings is not None:
+                    self.assertEqual(new_output_embeddings.shape[0], assert_size)
+                    self.assertEqual(
+                        new_output_embeddings.shape[1],
+                        old_output_embeddings.shape[1],
+                    )
 
-            del model
-            gc.collect()
-            torch.cuda.empty_cache()
+                    models_equal = True
+                    for p1, p2 in zip(
+                        old_output_embeddings.value(),
+                        new_output_embeddings.value(),
+                    ):
+                        if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                            models_equal = False
+                    self.assertTrue(models_equal)
 
-            # 2. MP test
-            # it's essential to re-calibrate the usage before the next stage
-            memory_at_start = get_current_gpu_memory_use()
+    def test_lm_head_model_random_no_beam_search_generate(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict.get("input_ids", None)
 
-            # Spread model layers over multiple devices
+        # iterate over all generative models
+        for model_class in self.all_generative_model_classes:
             model = model_class(config)
-            model.parallelize()
-            memory_after_parallelization = get_current_gpu_memory_use()
 
-            # Assert that the memory use on all devices is higher than it was when loaded only on CPU
-            for n in range(torch.cuda.device_count()):
-                self.assertGreater(memory_after_parallelization[n], memory_at_start[n])
-
-            # Assert that the memory use of device 0 is lower than it was when the entire model was loaded on it
-            self.assertLess(memory_after_parallelization[0], memory_after_model_load[0])
-
-            # Assert that the memory use of device 1 is higher than it was when the entire model was loaded
-            # on device 0 and device 1 wasn't used at all
-            self.assertGreater(memory_after_parallelization[1], memory_after_model_load[1])
-
-            del model
-            gc.collect()
-            torch.cuda.empty_cache()
-
-    @require_torch_multi_gpu
-    def test_model_parallel_equal_results(self):
-        if not self.test_model_parallel:
-            return
+            if config.bos_token_id is None:
+                # if bos token id is not defined model needs input_ids
+                with self.assertRaises(AssertionError):
+                    model.generate(do_sample=True, max_length=5)
+                # num_return_sequences = 1
+                self._check_generated_ids(model.generate(input_ids, do_sample=True))
+            elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]:
+                # Models with non-text inputs won't work here; num_return_sequences = 1
+                self._check_generated_ids(model.generate(do_sample=True, max_length=5))
+
+            with self.assertRaises(ValueError):
+                # generating multiple sequences without sampling or beam search
+                # is not allowed as it would always generate the same sequences
+                model.generate(input_ids, do_sample=False, num_return_sequences=2)
+
+            # num_return_sequences > 1, sample
+            self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2))
+
+            # check bad words tokens language generation
+            # create list of 1-seq bad token and list of 2-seq of bad tokens
+            bad_words_ids = [
+                self._generate_random_bad_tokens(1, model),
+                self._generate_random_bad_tokens(2, model),
+            ]
+            output_tokens = model.generate(
+                input_ids,
+                do_sample=True,
+                bad_words_ids=bad_words_ids,
+                num_return_sequences=2,
+            )
+            # only count generated tokens
+            generated_ids = output_tokens[:, input_ids.shape[-1] :]
+            self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict.get("input_ids", None)
+        if input_ids is None:
+            input_ids = inputs_dict.get("input_features", None)
 
-        for model_class in self.all_parallelizable_model_classes:
-            inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+        # iterate over all generative models
+        for model_class in self.all_generative_model_classes:
+            model = model_class(config)
+            output_greedy = model.generate(
+                input_ids,
+                do_sample=False,
+                output_scores=True,
+                output_hidden_states=True,
+                output_attentions=True,
+                return_dict_in_generate=True,
+            )
+            output_sample = model.generate(
+                input_ids,
+                do_sample=True,
+                output_scores=True,
+                output_hidden_states=True,
+                output_attentions=True,
+                return_dict_in_generate=True,
+            )
 
-            def cast_to_device(dictionary, device):
-                output = {}
-                for k, v in dictionary.items():
-                    if isinstance(v, torch.Tensor):
-                        output[k] = v.to(device)
-                    else:
-                        output[k] = v
+            if model.config.is_encoder_decoder:
+                self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput)
+                self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput)
+            else:
+                self.assertIsInstance(output_greedy, TFGreedySearchDecoderOnlyOutput)
+                self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
 
-                return output
+    def test_lm_head_model_random_beam_search_generate(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict.get("input_ids", None)
 
+        for model_class in self.all_generative_model_classes:
             model = model_class(config)
-            output = model(**cast_to_device(inputs_dict, "cpu"))
-
-            model.parallelize()
-
-            parallel_output = model(**cast_to_device(inputs_dict, "cuda:0"))
 
-            for value, parallel_value in zip(output, parallel_output):
-                if isinstance(value, torch.Tensor):
-                    self.assertTrue(torch.allclose(value, parallel_value.to("cpu"), atol=1e-7))
-                elif isinstance(value, (Tuple, List)):
-                    for value_, parallel_value_ in zip(value, parallel_value):
-                        self.assertTrue(torch.allclose(value_, parallel_value_.to("cpu"), atol=1e-7))
+            if config.bos_token_id is None:
+                # if bos token id is not defined model needs input_ids, num_return_sequences = 1
+                self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2))
+            else:
+                # num_return_sequences = 1
+                self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2))
+
+            with self.assertRaises(AssertionError):
+                # generating more sequences than there are beams is not possible
+                model.generate(
+                    input_ids,
+                    do_sample=False,
+                    num_return_sequences=3,
+                    num_beams=2,
+                )
 
-    @require_torch_multi_gpu
-    def test_model_parallel_beam_search(self):
-        if not self.test_model_parallel:
-            return
+            # num_return_sequences > 1, sample
+            self._check_generated_ids(
+                model.generate(
+                    input_ids,
+                    do_sample=True,
+                    num_beams=2,
+                    num_return_sequences=2,
+                )
+            )
+            # num_return_sequences > 1, greedy
+            self._check_generated_ids(
+                model.generate(
+                    input_ids,
+                    do_sample=False,
+                    num_beams=2,
+                    num_return_sequences=2,
+                )
+            )
 
-        all_generative_and_parallelizable_model_classes = tuple(
-            set(self.all_generative_model_classes).intersection(self.all_parallelizable_model_classes)
-        )
+            # check bad words tokens language generation
+            # create list of 1-seq bad token and list of 2-seq of bad tokens
+            bad_words_ids = [
+                self._generate_random_bad_tokens(1, model),
+                self._generate_random_bad_tokens(2, model),
+            ]
+            output_tokens = model.generate(
+                input_ids,
+                do_sample=False,
+                bad_words_ids=bad_words_ids,
+                num_beams=2,
+                num_return_sequences=2,
+            )
+            # only count generated tokens
+            generated_ids = output_tokens[:, input_ids.shape[-1] :]
+            self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_lm_head_model_beam_search_generate_dict_outputs(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict.get("input_ids", None)
+        if input_ids is None:
+            input_ids = inputs_dict.get("input_features", None)
 
-        for model_class in all_generative_and_parallelizable_model_classes:
-            inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+        # iterate over all generative models
+        for model_class in self.all_generative_model_classes:
             model = model_class(config)
+            output_beam_search = model.generate(
+                input_ids,
+                num_beams=2,
+                do_sample=False,
+                output_scores=True,
+                output_hidden_states=True,
+                output_attentions=True,
+                return_dict_in_generate=True,
+            )
+            output_beam_sample = model.generate(
+                input_ids,
+                num_beams=2,
+                do_sample=True,
+                output_scores=True,
+                output_hidden_states=True,
+                output_attentions=True,
+                return_dict_in_generate=True,
+            )
 
-            def cast_to_device(dictionary, device):
-                output = {}
-                for k, v in dictionary.items():
-                    if isinstance(v, torch.Tensor):
-                        output[k] = v.to(device)
-                    else:
-                        output[k] = v
-
-                return output
-
-            model.parallelize()
-            model.generate(**cast_to_device(inputs_dict, "cuda:0"), num_beams=2)
-
-    def test_problem_types(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        problem_types = [
-            {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float},
-            {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long},
-            {"title": "regression", "num_labels": 1, "dtype": torch.float},
-        ]
+            if model.config.is_encoder_decoder:
+                self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput)
+                self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput)
+            else:
+                self.assertIsInstance(output_beam_search, TFBeamSearchDecoderOnlyOutput)
+                self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
 
+    def test_loss_computation(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
-            if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
-                continue
-
-            for problem_type in problem_types:
-                with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"):
+            model = model_class(config)
+            if getattr(model, "hf_compute_loss", None):
+                # The number of elements in the loss should be the same as the number of elements in the label
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                added_label = prepared_for_class[
+                    sorted(
+                        list(prepared_for_class.keys() - inputs_dict.keys()),
+                        reverse=True,
+                    )[0]
+                ]
+                loss_size = tf.size(added_label)
 
-                    config.problem_type = problem_type["title"]
-                    config.num_labels = problem_type["num_labels"]
+                if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING):
+                    # for a causal LM loss the labels are shifted, so one label per sequence
+                    # in the batch is cut
+                    loss_size = loss_size - self.model_tester.batch_size
 
-                    model = model_class(config)
-                    model.to(torch_device)
-                    model.train()
+                # Test that model correctly compute the loss with kwargs
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                possible_input_names = {
+                    "input_ids",
+                    "pixel_values",
+                    "input_features",
+                }
+                input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
+                model_input = prepared_for_class.pop(input_name)
+
+                loss = model(model_input, **prepared_for_class)[0]
+                self.assertEqual(loss.shape, [loss_size])
+
+                # Test that model correctly compute the loss with a dict
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                loss = model(prepared_for_class)[0]
+                self.assertEqual(loss.shape, [loss_size])
+
+                # Test that model correctly compute the loss with a tuple
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+
+                # Get keys that were added with the _prepare_for_class function
+                label_keys = prepared_for_class.keys() - inputs_dict.keys()
+                signature = inspect.signature(model.call).parameters
+                signature_names = list(signature.keys())
+
+                # Create a dictionary holding the location of the tensors in the tuple
+                tuple_index_mapping = {0: input_name}
+                for label_key in label_keys:
+                    label_key_index = signature_names.index(label_key)
+                    tuple_index_mapping[label_key_index] = label_key
+                sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
+                # Initialize a list with their default values, update the values and convert to a tuple
+                list_input = []
+
+                for name in signature_names:
+                    if name != "kwargs":
+                        list_input.append(signature[name].default)
+
+                for index, value in sorted_tuple_index_mapping:
+                    list_input[index] = prepared_for_class[value]
+
+                tuple_input = tuple(list_input)
+
+                # Send to model
+                loss = model(tuple_input[:-1])[0]
+
+                self.assertEqual(loss.shape, [loss_size])
+
+    def test_generate_with_headmasking(self):
+        attention_names = [
+            "encoder_attentions",
+            "decoder_attentions",
+            "cross_attentions",
+        ]
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-                    inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+        for model_class in self.all_generative_model_classes:
+            model = model_class(config)
 
-                    if problem_type["num_labels"] > 1:
-                        inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"])
+            # We want to test only encoder-decoder models
+            if not config.is_encoder_decoder:
+                continue
 
-                    inputs["labels"] = inputs["labels"].to(problem_type["dtype"])
+            head_masking = {
+                "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)),
+                "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
+                "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
+            }
 
-                    # This tests that we do not trigger the warning form PyTorch "Using a target size that is different
-                    # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure
-                    # they have the same size." which is a symptom something in wrong for the regression problem.
-                    # See https://github.com/huggingface/transformers/issues/11780
-                    with warnings.catch_warnings(record=True) as warning_list:
-                        loss = model(**inputs).loss
-                    for w in warning_list:
-                        if "Using a target size that is different to the input size" in str(w.message):
-                            raise ValueError(
-                                f"Something is going wrong in the regression problem: intercepted {w.message}"
-                            )
+            signature = inspect.signature(model.call)
+            if not set(head_masking.keys()) < set([*signature.parameters.keys()]):  # skip: no head-mask args
+                continue
 
-                    loss.backward()
+            for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
+                out = model.generate(
+                    inputs_dict["input_ids"],
+                    num_beams=1,
+                    max_length=inputs_dict["input_ids"].shape[-1] + 5,
+                    output_attentions=True,
+                    return_dict_in_generate=True,
+                    **{name: mask},
+                )
+                # We check the state of decoder_attentions and cross_attentions just from the last step
+                attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
+                self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0)
 
     def test_load_with_mismatched_shapes(self):
         if not self.test_mismatched_shapes:
             return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
-            if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
+            if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
                 continue
 
             with self.subTest(msg=f"Testing {model_class}"):
                 with tempfile.TemporaryDirectory() as tmp_dir:
                     model = model_class(config)
+                    inputs = self._prepare_for_class(inputs_dict, model_class)
+                    _ = model(**inputs)
                     model.save_pretrained(tmp_dir)
 
                     # Fails when we don't set ignore_mismatched_sizes=True
-                    with self.assertRaises(RuntimeError):
-                        new_model = AutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42)
-                    with self.assertRaises(RuntimeError):
-                        new_model_without_prefix = AutoModel.from_pretrained(tmp_dir, vocab_size=10)
-
-                    logger = logging.get_logger("transformers.modeling_utils")
+                    with self.assertRaises(ValueError):
+                        new_model = TFAutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42)
+                    with self.assertRaises(ValueError):
+                        new_model_without_prefix = TFAutoModel.from_pretrained(tmp_dir, vocab_size=10)
 
+                    logger = logging.get_logger("transformers.modeling_tf_utils")
                     with CaptureLogger(logger) as cl:
-                        new_model = AutoModelForSequenceClassification.from_pretrained(
+                        new_model = TFAutoModelForSequenceClassification.from_pretrained(
                             tmp_dir, num_labels=42, ignore_mismatched_sizes=True
                         )
                     self.assertIn("the shapes did not match", cl.out)
-                    new_model.to(torch_device)
-                    inputs = self._prepare_for_class(inputs_dict, model_class)
+
                     logits = new_model(**inputs).logits
                     self.assertEqual(logits.shape[1], 42)
 
                     with CaptureLogger(logger) as cl:
-                        new_model_without_prefix = AutoModel.from_pretrained(
+                        new_model_without_prefix = TFAutoModel.from_pretrained(
                             tmp_dir, vocab_size=10, ignore_mismatched_sizes=True
                         )
                     self.assertIn("the shapes did not match", cl.out)
+
+                    # Although Tf models always have a prefix pointing to `MainLayer`,
+                    # we still add this "without prefix" test to keep a consistency between tf and pt tests.
                     input_ids = ids_tensor((2, 8), 10)
-                    new_model_without_prefix.to(torch_device)
                     if self.is_encoder_decoder:
                         new_model_without_prefix(input_ids, decoder_input_ids=input_ids)
                     else:
                         new_model_without_prefix(input_ids)
 
+    def test_model_main_input_name(self):
+        for model_class in self.all_model_classes:
+            model_signature = inspect.signature(getattr(model_class, "call"))
+            # The main input is the name of the argument after `self`
+            observed_main_input_name = list(model_signature.parameters.keys())[1]
+            self.assertEqual(model_class.main_input_name, observed_main_input_name)
 
-global_rng = random.Random()
-
-
-def ids_tensor(shape, vocab_size, rng=None, name=None):
-    #  Creates a random int32 tensor of the shape within the vocab size
+    def _generate_random_bad_tokens(self, num_bad_tokens, model):
+        # special tokens cannot be bad tokens
+        special_tokens = []
+        if model.config.bos_token_id is not None:
+            special_tokens.append(model.config.bos_token_id)
+        if model.config.pad_token_id is not None:
+            special_tokens.append(model.config.pad_token_id)
+        if model.config.eos_token_id is not None:
+            special_tokens.append(model.config.eos_token_id)
+
+        # create random bad tokens that are not special tokens
+        bad_tokens = []
+        while len(bad_tokens) < num_bad_tokens:
+            token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0]
+            if token not in special_tokens:
+                bad_tokens.append(token)
+        return bad_tokens
+
+    def _check_generated_ids(self, output_ids):
+        for token_id in output_ids[0].numpy().tolist():
+            self.assertGreaterEqual(token_id, 0)
+            self.assertLess(token_id, self.model_tester.vocab_size)
+
+    def _check_match_tokens(self, generated_ids, bad_words_ids):
+        """Return True if any sequence in `bad_words_ids` appears as a contiguous run in `generated_ids`."""
+        for bad_word_ids in bad_words_ids:
+            # for all slices in batch
+            for generated_ids_slice in generated_ids:
+                # `i` is the exclusive window end; `+ 1` so the window ending on the last token is checked
+                for i in range(len(bad_word_ids), len(generated_ids_slice) + 1):
+                    # if tokens match
+                    if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids:
+                        return True
+        return False
+
+
+def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
+    """Creates a random int32 tensor of the shape within the vocab size."""
     if rng is None:
-        rng = global_rng
+        rng = random.Random()
 
     total_dims = 1
     for dim in shape:
@@ -1966,20 +1483,28 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
     for _ in range(total_dims):
         values.append(rng.randint(0, vocab_size - 1))
 
-    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
+    output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)
+
+    return output
 
 
-def random_attention_mask(shape, rng=None, name=None):
-    attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None)
+def random_attention_mask(shape, rng=None, name=None, dtype=None):
+    attn_mask = ids_tensor(shape, vocab_size=2, rng=rng, name=name, dtype=dtype)
     # make sure that at least one token is attended to for each batch
-    attn_mask[:, -1] = 1
+    attn_mask = tf.concat(
+        [
+            tf.constant(value=1, shape=(shape[0], 1), dtype=dtype),
+            attn_mask[:, 1:],
+        ],
+        axis=1,
+    )
     return attn_mask
 
 
-def floats_tensor(shape, scale=1.0, rng=None, name=None):
+def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
     """Creates a random float32 tensor"""
     if rng is None:
-        rng = global_rng
+        rng = random.Random()
 
     total_dims = 1
     for dim in shape:
@@ -1989,128 +1514,134 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
     for _ in range(total_dims):
         values.append(rng.random() * scale)
 
-    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
-
-
-@require_torch
-class ModelUtilsTest(TestCasePlus):
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-            config = BertConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, PretrainedConfig)
-
-            model = BertModel.from_pretrained(model_name)
-            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, PreTrainedModel)
-
-            self.assertEqual(len(loading_info["missing_keys"]), 0)
-            self.assertEqual(len(loading_info["unexpected_keys"]), 8)
-            self.assertEqual(len(loading_info["mismatched_keys"]), 0)
-            self.assertEqual(len(loading_info["error_msgs"]), 0)
-
-            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+    return tf.reshape(
+        tf.constant(values, dtype=dtype if dtype is not None else tf.float32),
+        shape=shape,
+    )
 
-            # Not sure this is the intended behavior. TODO fix Lysandre & Thom
-            config.name_or_path = model_name
 
-            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            self.assertEqual(model.config.output_hidden_states, True)
-            self.assertEqual(model.config, config)
-
-    def test_model_from_pretrained_with_different_pretrained_model_name(self):
-        model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
-        self.assertIsNotNone(model)
-
-        logger = logging.get_logger("transformers.configuration_utils")
-        with CaptureLogger(logger) as cl:
-            BertModel.from_pretrained(TINY_T5)
-        self.assertTrue("You are using a model of type t5 to instantiate a model of type bert" in cl.out)
-
-    @require_torch
-    def test_model_from_config_torch_dtype(self):
-        # test that the model can be instantiated with dtype of user's choice - as long as it's a
-        # float dtype. To make it happen config.torch_dtype needs to be set before instantiating the
-        # model from the config object.
-
-        config = T5Config.from_pretrained(TINY_T5)
-        model = AutoModel.from_config(config)
-        # XXX: isn't supported
-        # model = T5ForConditionalGeneration.from_config(config)
-        self.assertEqual(model.dtype, torch.float32)
-
-        model = AutoModel.from_config(config, torch_dtype=torch.float16)
-        self.assertEqual(model.dtype, torch.float16)
-
-        # torch.set_default_dtype() supports only float dtypes, so will fail with non-float type
-        with self.assertRaises(ValueError):
-            model = AutoModel.from_config(config, torch_dtype=torch.int64)
-
-    @require_torch
-    def test_model_from_pretrained_torch_dtype(self):
-        # test that the model can be instantiated with dtype of either
-        # 1. explicit from_pretrained's torch_dtype argument
-        # 2. via autodiscovery by looking at model weights (torch_dtype="auto")
-        # so if a model.half() was saved, we want it to be instantiated as such.
-        #
-        # test an explicit model class, but also AutoModel separately as the latter goes through a different code path
-        model_path = self.get_auto_remove_tmp_dir()
-
-        # baseline - we know TINY_T5 is fp32 model
-        model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
-        self.assertEqual(model.dtype, torch.float32)
-
-        # test the default fp32 save_pretrained => from_pretrained cycle
-        model.save_pretrained(model_path)
-        model = T5ForConditionalGeneration.from_pretrained(model_path)
-        self.assertEqual(model.dtype, torch.float32)
-        # test with auto-detection
-        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
-        self.assertEqual(model.dtype, torch.float32)
-
-        # test forced loading in fp16 (even though the weights are in fp32)
-        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
-        self.assertEqual(model.dtype, torch.float16)
-
-        # test fp16 save_pretrained, loaded with auto-detection
-        model = model.half()
-        model.save_pretrained(model_path)
-        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
-        self.assertEqual(model.config.torch_dtype, torch.float16)
-        self.assertEqual(model.dtype, torch.float16)
-
-        # tests `config.torch_dtype` saving
-        with open(f"{model_path}/config.json") as f:
-            config_dict = json.load(f)
-        self.assertEqual(config_dict["torch_dtype"], "float16")
-
-        # test fp16 save_pretrained, loaded with the explicit fp16
-        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
-        self.assertEqual(model.dtype, torch.float16)
-
-        # test AutoModel separately as it goes through a different path
-        # test auto-detection
-        model = AutoModel.from_pretrained(TINY_T5, torch_dtype="auto")
-        self.assertEqual(model.dtype, torch.float32)
-        # test forcing an explicit dtype
-        model = AutoModel.from_pretrained(TINY_T5, torch_dtype=torch.float16)
-        self.assertEqual(model.dtype, torch.float16)
-
-    def test_no_super_init_config_and_model(self):
-        config = NoSuperInitConfig(attribute=32)
-        model = NoSuperInitModel(config)
+@require_tf
+class UtilsFunctionsTest(unittest.TestCase):
+
+    # tests whether the top_k_top_p_filtering function behaves as expected
+    def test_top_k_top_p_filtering(self):
+        logits = tf.convert_to_tensor(
+            [
+                [
+                    8.2220991,  # 3rd highest value; idx. 0
+                    -0.5620044,
+                    5.23229752,
+                    4.0386393,
+                    -6.8798378,
+                    -0.54785802,
+                    -3.2012153,
+                    2.92777176,
+                    1.88171953,
+                    7.35341276,  # 5th highest value; idx. 9
+                    8.43207833,  # 2nd highest value; idx. 10
+                    -9.85711836,
+                    -5.96209236,
+                    -1.13039161,
+                    -7.1115294,
+                    -0.8369633,
+                    -5.3186408,
+                    7.06427407,
+                    0.81369344,
+                    -0.82023817,
+                    -5.9179796,
+                    0.58813443,
+                    -6.99778438,
+                    4.71551189,
+                    -0.18771637,
+                    7.44020759,  # 4th highest value; idx. 25
+                    9.38450987,  # 1st highest value; idx. 26
+                    2.12662941,
+                    -9.32562038,
+                    2.35652522,
+                ],  # cumulative prob of 5 highest values <= 0.6
+                [
+                    0.58425518,
+                    4.53139238,
+                    -5.57510464,
+                    -6.28030699,
+                    -7.19529503,
+                    -4.02122551,
+                    1.39337037,
+                    -6.06707057,
+                    1.59480517,
+                    -9.643119,
+                    0.03907799,
+                    0.67231762,
+                    -8.88206726,
+                    6.27115922,  # 4th highest value; idx. 13
+                    2.28520723,
+                    4.82767506,
+                    4.30421368,
+                    8.8275313,  # 2nd highest value; idx. 17
+                    5.44029958,  # 5th highest value; idx. 18
+                    -4.4735794,
+                    7.38579536,  # 3rd highest value; idx. 20
+                    -2.91051663,
+                    2.61946077,
+                    -2.5674762,
+                    -9.48959302,
+                    -4.02922645,
+                    -1.35416918,
+                    9.67702323,  # 1st highest value; idx. 27
+                    -5.89478553,
+                    1.85370467,
+                ],  # cumulative prob of 5 highest values <= 0.6
+            ],
+            dtype=tf.float32,
+        )
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir)
+        non_inf_expected_idx = tf.convert_to_tensor(
+            [
+                [0, 0],
+                [0, 9],
+                [0, 10],
+                [0, 25],
+                [0, 26],
+                [1, 13],
+                [1, 17],
+                [1, 18],
+                [1, 20],
+                [1, 27],
+            ],
+            dtype=tf.int32,
+        )  # expected non filtered idx as noted above
+
+        non_inf_expected_output = tf.convert_to_tensor(
+            [
+                8.222099,
+                7.3534126,
+                8.432078,
+                7.4402075,
+                9.38451,
+                6.271159,
+                8.827531,
+                5.4402995,
+                7.3857956,
+                9.677023,
+            ],
+            dtype=tf.float32,
+        )  # expected non filtered values as noted above
+
+        output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4)
+
+        non_inf_output = output[output != -float("inf")]
+        non_inf_idx = tf.cast(
+            tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))),
+            dtype=tf.int32,
+        )
 
-            model = NoSuperInitModel.from_pretrained(tmp_dir)
+        tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12)
+        tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx)
 
 
-@require_torch
+@require_tf
 @is_staging_test
-class ModelPushToHubTester(unittest.TestCase):
+class TFModelPushToHubTester(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls._token = login(username=USER, password=PASS)
@@ -2118,78 +1649,77 @@ def setUpClass(cls):
     @classmethod
     def tearDownClass(cls):
         try:
-            delete_repo(token=cls._token, name="test-model")
+            delete_repo(token=cls._token, name="test-model-tf")
         except HTTPError:
             pass
 
         try:
-            delete_repo(token=cls._token, name="test-model-org", organization="valid_org")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, name="test-dynamic-model")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, name="test-dynamic-model-config")
+            delete_repo(
+                token=cls._token,
+                name="test-model-tf-org",
+                organization="valid_org",
+            )
         except HTTPError:
             pass
 
     def test_push_to_hub(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
-        model = BertModel(config)
+        model = TFBertModel(config)
+        # Make sure model is properly initialized
+        _ = model(model.dummy_inputs)
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(os.path.join(tmp_dir, "test-model"), push_to_hub=True, use_auth_token=self._token)
+            model.save_pretrained(
+                os.path.join(tmp_dir, "test-model-tf"),
+                push_to_hub=True,
+                use_auth_token=self._token,
+            )
+
+            new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
+            models_equal = True
+            for p1, p2 in zip(model.weights, new_model.weights):
+                if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                    models_equal = False
+            self.assertTrue(models_equal)
 
-            new_model = BertModel.from_pretrained(f"{USER}/test-model")
-            for p1, p2 in zip(model.parameters(), new_model.parameters()):
-                self.assertTrue(torch.equal(p1, p2))
+    def test_push_to_hub_with_model_card(self):
+        config = BertConfig(
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+        )
+        model = TFBertModel(config)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.push_to_hub(os.path.join(tmp_dir, "test-model-tf"))
+            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "test-model-tf", "README.md")))
 
     def test_push_to_hub_in_organization(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
-        model = BertModel(config)
+        model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
             model.save_pretrained(
-                os.path.join(tmp_dir, "test-model-org"),
+                os.path.join(tmp_dir, "test-model-tf-org"),
                 push_to_hub=True,
                 use_auth_token=self._token,
                 organization="valid_org",
             )
 
-            new_model = BertModel.from_pretrained("valid_org/test-model-org")
-            for p1, p2 in zip(model.parameters(), new_model.parameters()):
-                self.assertTrue(torch.equal(p1, p2))
-
-    def test_push_to_hub_dynamic_model(self):
-        CustomConfig.register_for_auto_class()
-        CustomModel.register_for_auto_class()
-
-        config = CustomConfig(hidden_size=32)
-        model = CustomModel(config)
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-model", use_auth_token=self._token)
-            model.save_pretrained(tmp_dir)
-            # checks
-            self.assertDictEqual(
-                config.auto_map,
-                {"AutoConfig": "custom_configuration.CustomConfig", "AutoModel": "custom_modeling.CustomModel"},
-            )
-
-            repo.push_to_hub()
-
-        new_model = AutoModel.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True)
-        # Can't make an isinstance check because the new_model is from the CustomModel class of a dynamic module
-        self.assertEqual(new_model.__class__.__name__, "CustomModel")
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
-
-        config = AutoConfig.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True)
-        new_model = AutoModel.from_config(config, trust_remote_code=True)
-        self.assertEqual(new_model.__class__.__name__, "CustomModel")
+            new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org")
+            models_equal = True
+            for p1, p2 in zip(model.weights, new_model.weights):
+                if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                    models_equal = False
+            self.assertTrue(models_equal)

From ad5d7e0153ab9895205a238f0ee03f9c2259833d Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 24 Feb 2022 15:43:01 +0530
Subject: [PATCH 56/65] chore: revert to the previous
 tests/test_modeling_common.py.

---
 tests/test_modeling_common.py | 3172 ++++++++++++++++++++-------------
 1 file changed, 1961 insertions(+), 1211 deletions(-)

diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index bf707b762c394..348ffcd2c4490 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -13,152 +13,183 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import copy
+import gc
 import inspect
 import json
 import os
+import os.path
 import random
+import sys
 import tempfile
 import unittest
-from importlib import import_module
-from typing import List, Tuple
+import warnings
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import numpy as np
 
-from huggingface_hub import delete_repo, login
+import transformers
+from huggingface_hub import Repository, delete_repo, login
 from requests.exceptions import HTTPError
-from transformers import is_tf_available
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForSequenceClassification,
+    PretrainedConfig,
+    is_torch_available,
+    logging,
+)
+from transformers.file_utils import WEIGHTS_NAME, is_flax_available, is_torch_fx_available
 from transformers.models.auto import get_values
-from transformers.testing_utils import tooslow  # noqa: F401
 from transformers.testing_utils import (
     PASS,
     USER,
     CaptureLogger,
-    _tf_gpu_memory_limit,
+    TestCasePlus,
+    is_pt_flax_cross_test,
     is_pt_tf_cross_test,
     is_staging_test,
-    require_tf,
-    require_tf2onnx,
+    require_torch,
+    require_torch_multi_gpu,
     slow,
+    torch_device,
 )
-from transformers.utils import logging
 
 
-if is_tf_available():
-    import numpy as np
-    import tensorflow as tf
+sys.path.append(str(Path(__file__).parent.parent / "utils"))
+
+from test_module.custom_configuration import CustomConfig, NoSuperInitConfig  # noqa E402
 
+
+if is_torch_available():
+    import torch
+    from torch import nn
+
+    from test_module.custom_modeling import CustomModel, NoSuperInitModel
     from transformers import (
-        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
-        TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
-        TF_MODEL_FOR_MASKED_LM_MAPPING,
-        TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
-        TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
-        TF_MODEL_FOR_PRETRAINING_MAPPING,
-        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-        TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
-        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+        BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
+        MODEL_FOR_CAUSAL_LM_MAPPING,
+        MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+        MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
+        MODEL_FOR_MASKED_LM_MAPPING,
+        MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
+        MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
+        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+        MODEL_MAPPING,
+        AdaptiveEmbedding,
         BertConfig,
-        TFAutoModel,
-        TFAutoModelForSequenceClassification,
-        TFBertModel,
-        TFSharedEmbeddings,
-        tf_top_k_top_p_filtering,
+        BertModel,
+        PreTrainedModel,
+        T5Config,
+        T5ForConditionalGeneration,
     )
-    from transformers.generation_tf_utils import (
-        TFBeamSampleDecoderOnlyOutput,
-        TFBeamSampleEncoderDecoderOutput,
-        TFBeamSearchDecoderOnlyOutput,
-        TFBeamSearchEncoderDecoderOutput,
-        TFGreedySearchDecoderOnlyOutput,
-        TFGreedySearchEncoderDecoderOutput,
-        TFSampleDecoderOnlyOutput,
-        TFSampleEncoderDecoderOutput,
+
+if is_flax_available():
+    import jax.numpy as jnp
+    from transformers.modeling_flax_pytorch_utils import (
+        convert_pytorch_state_dict_to_flax,
+        load_flax_weights_in_pytorch_model,
     )
 
-    if _tf_gpu_memory_limit is not None:
-        gpus = tf.config.list_physical_devices("GPU")
-        for gpu in gpus:
-            # Restrict TensorFlow to only allocate x GB of memory on the GPUs
-            try:
-                tf.config.set_logical_device_configuration(
-                    gpu,
-                    [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)],
-                )
-                logical_gpus = tf.config.list_logical_devices("GPU")
-                print("Logical GPUs", logical_gpus)
-            except RuntimeError as e:
-                # Virtual devices must be set before GPUs have been initialized
-                print(e)
+if is_torch_fx_available():
+    from transformers.utils.fx import symbolic_trace
 
 
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__.keys():
-        if "_range" in key or "_std" in key:
-            setattr(configs_no_init, key, 0.0)
+        if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
+            setattr(configs_no_init, key, 1e-10)
     return configs_no_init
 
 
-@require_tf
-class TFModelTesterMixin:
+TINY_T5 = "patrickvonplaten/t5-tiny-random"
+
+
+@require_torch
+class ModelTesterMixin:
 
     model_tester = None
     all_model_classes = ()
     all_generative_model_classes = ()
-    test_mismatched_shapes = True
+    fx_compatible = False
+    test_torchscript = True
+    test_pruning = True
     test_resize_embeddings = True
+    test_resize_position_embeddings = False
     test_head_masking = True
+    test_mismatched_shapes = True
+    test_missing_keys = True
+    test_model_parallel = False
     is_encoder_decoder = False
 
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = copy.deepcopy(inputs_dict)
-
-        if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
+        if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
             inputs_dict = {
-                k: tf.tile(
-                    tf.expand_dims(v, 1),
-                    (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1),
-                )
-                if isinstance(v, tf.Tensor) and v.ndim > 0
+                k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
+                if isinstance(v, torch.Tensor) and v.ndim > 1
                 else v
                 for k, v in inputs_dict.items()
             }
 
         if return_labels:
-            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
-                inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-                inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+            if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
+                inputs_dict["labels"] = torch.ones(
+                    self.model_tester.batch_size,
+                    dtype=torch.long,
+                    device=torch_device,
+                )
+            elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
+                inputs_dict["start_positions"] = torch.zeros(
+                    self.model_tester.batch_size,
+                    dtype=torch.long,
+                    device=torch_device,
+                )
+                inputs_dict["end_positions"] = torch.zeros(
+                    self.model_tester.batch_size,
+                    dtype=torch.long,
+                    device=torch_device,
+                )
             elif model_class in [
-                *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
-                *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
+                *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
+                *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
+                *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
             ]:
-                inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
-                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+                inputs_dict["labels"] = torch.zeros(
+                    self.model_tester.batch_size,
+                    dtype=torch.long,
+                    device=torch_device,
+                )
             elif model_class in [
-                *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
-                *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
-                *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
+                *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
+                *get_values(MODEL_FOR_CAUSAL_LM_MAPPING),
+                *get_values(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING),
+                *get_values(MODEL_FOR_MASKED_LM_MAPPING),
+                *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
             ]:
-                inputs_dict["labels"] = tf.zeros(
+                inputs_dict["labels"] = torch.zeros(
                     (
                         self.model_tester.batch_size,
                         self.model_tester.seq_length,
                     ),
-                    dtype=tf.int32,
+                    dtype=torch.long,
+                    device=torch_device,
+                )
+            elif model_class in get_values(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING):
+                num_patches = self.model_tester.image_size // self.model_tester.patch_size
+                inputs_dict["bool_masked_pos"] = torch.zeros(
+                    (self.model_tester.batch_size, num_patches ** 2),
+                    dtype=torch.long,
+                    device=torch_device,
                 )
         return inputs_dict
 
-    def test_initialization(self):
-        pass
-
     def test_save_load(self):
         (
             config,
@@ -167,16 +198,28 @@ def test_save_load(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            out_2 = outputs[0].cpu().numpy()
+            out_2[np.isnan(out_2)] = 0
 
             with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=False)
+                model.save_pretrained(tmpdirname)
                 model = model_class.from_pretrained(tmpdirname)
-                after_outputs = model(self._prepare_for_class(inputs_dict, model_class))
+                model.to(torch_device)
+                with torch.no_grad():
+                    after_outputs = model(**self._prepare_for_class(inputs_dict, model_class))
 
-                self.assert_outputs_same(after_outputs, outputs)
+                # Make sure we don't have nans
+                out_1 = after_outputs[0].cpu().numpy()
+                out_1[np.isnan(out_1)] = 0
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
 
-    def test_save_load_config(self):
+    def test_save_load_keys_to_ignore_on_save(self):
         (
             config,
             inputs_dict,
@@ -184,417 +227,288 @@ def test_save_load_config(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            model_config = model.get_config()
-            # make sure that returned config is jsonifiable, which is required by keras
-            json.dumps(model_config)
-            new_model = model_class.from_config(model.get_config())
-            # make sure it also accepts a normal config
-            _ = model_class.from_config(model.config)
-            _ = new_model(self._prepare_for_class(inputs_dict, model_class))  # Build model
-            new_model.set_weights(model.get_weights())
-            after_outputs = new_model(self._prepare_for_class(inputs_dict, model_class))
-
-            self.assert_outputs_same(after_outputs, outputs)
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
+            _keys_to_ignore_on_save = getattr(model, "_keys_to_ignore_on_save", None)
+            if _keys_to_ignore_on_save is None:
+                continue
 
-            if model.config.is_encoder_decoder:
-                expected_arg_names = [
-                    "input_ids",
-                    "attention_mask",
-                    "decoder_input_ids",
-                    "decoder_attention_mask",
-                ]
-                expected_arg_names.extend(
-                    ["head_mask", "decoder_head_mask"] if "head_mask" and "decoder_head_mask" in arg_names else []
-                )
-                # Necessary to handle BART with newly added cross_attn_head_mask
-                expected_arg_names.extend(
-                    ["cross_attn_head_mask", "encoder_outputs"]
-                    if "cross_attn_head_mask" in arg_names
-                    else ["encoder_outputs"]
+            # check the keys are in the original state_dict
+            for k in _keys_to_ignore_on_save:
+                self.assertIn(
+                    k,
+                    model.state_dict().keys(),
+                    "\n".join(model.state_dict().keys()),
                 )
-                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
 
-            else:
-                expected_arg_names = ["input_ids"]
-                self.assertListEqual(arg_names[:1], expected_arg_names)
+            # check that certain keys didn't get saved with the model
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                output_model_file = os.path.join(tmpdirname, WEIGHTS_NAME)
+                state_dict_saved = torch.load(output_model_file)
+                for k in _keys_to_ignore_on_save:
+                    self.assertNotIn(
+                        k,
+                        state_dict_saved.keys(),
+                        "\n".join(state_dict_saved.keys()),
+                    )
 
-    def test_onnx_compliancy(self):
-        if not self.test_onnx:
-            return
+                # Test we can load the state dict in the model, necessary for the checkpointing API in Trainer.
+                load_result = model.load_state_dict(state_dict_saved, strict=False)
+                self.assertTrue(
+                    len(load_result.missing_keys) == 0
+                    or set(load_result.missing_keys) == set(model._keys_to_ignore_on_save)
+                )
+                self.assertTrue(len(load_result.unexpected_keys) == 0)
 
+    def test_gradient_checkpointing_backward_compatibility(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        INTERNAL_OPS = [
-            "Assert",
-            "AssignVariableOp",
-            "EmptyTensorList",
-            "ReadVariableOp",
-            "ResourceGather",
-            "TruncatedNormal",
-            "VarHandleOp",
-            "VarIsInitializedOp",
-        ]
-        onnx_ops = []
-
-        with open(os.path.join(".", "utils", "tf_ops", "onnx.json")) as f:
-            onnx_opsets = json.load(f)["opsets"]
-
-        for i in range(1, self.onnx_min_opset + 1):
-            onnx_ops.extend(onnx_opsets[str(i)])
 
         for model_class in self.all_model_classes:
-            model_op_names = set()
+            if not model_class.supports_gradient_checkpointing:
+                continue
 
-            with tf.Graph().as_default() as g:
-                model = model_class(config)
-                model(model.dummy_inputs)
+            config.gradient_checkpointing = True
+            model = model_class(config)
+            self.assertTrue(model.is_gradient_checkpointing)
 
-                for op in g.get_operations():
-                    model_op_names.add(op.node_def.op)
+    def test_gradient_checkpointing_enable_disable(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-            model_op_names = sorted(model_op_names)
-            incompatible_ops = []
+        for model_class in self.all_model_classes:
+            if not model_class.supports_gradient_checkpointing:
+                continue
 
-            for op in model_op_names:
-                if op not in onnx_ops and op not in INTERNAL_OPS:
-                    incompatible_ops.append(op)
+            # at init model should have gradient checkpointing disabled
+            model = model_class(config)
+            self.assertFalse(model.is_gradient_checkpointing)
 
-            self.assertEqual(len(incompatible_ops), 0, incompatible_ops)
+            # check enable works
+            model.gradient_checkpointing_enable()
+            self.assertTrue(model.is_gradient_checkpointing)
 
-    @require_tf2onnx
-    @slow
-    def test_onnx_runtime_optimize(self):
-        if not self.test_onnx:
-            return
+            # check disable works
+            model.gradient_checkpointing_disable()
+            self.assertFalse(model.is_gradient_checkpointing)
 
-        import onnxruntime
-        import tf2onnx
+    def _mock_init_weights(self, module):
+        if hasattr(module, "weight") and module.weight is not None:
+            module.weight.data.fill_(3)
+        if hasattr(module, "bias") and module.bias is not None:
+            module.bias.data.fill_(3)
 
+    def test_save_load_fast_init_from_base(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
+        base_class = MODEL_MAPPING[config.__class__]
+
+        if isinstance(base_class, tuple):
+            base_class = base_class[0]
 
         for model_class in self.all_model_classes:
-            model = model_class(config)
-            model(model.dummy_inputs)
+            if model_class == base_class:
+                continue
 
-            onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset)
+            # make a copy of model class to not break future tests
+            # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class
+            class CopyClass(model_class):
+                pass
 
-            onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
+            model_class_copy = CopyClass
 
-    def test_keras_save_load(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+            # make sure that all keys are expected for test
+            model_class_copy._keys_to_ignore_on_load_missing = []
 
-        tf_main_layer_classes = set(
-            module_member
-            for model_class in self.all_model_classes
-            for module in (import_module(model_class.__module__),)
-            for module_member_name in dir(module)
-            if module_member_name.endswith("MainLayer")
-            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
-            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
-            for module_member in (getattr(module, module_member_name),)
-            if isinstance(module_member, type)
-            and tf.keras.layers.Layer in module_member.__bases__
-            and getattr(module_member, "_keras_serializable", False)
-        )
-        for main_layer_class in tf_main_layer_classes:
-            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
-            if "T5" in main_layer_class.__name__:
-                # Take the same values than in TFT5ModelTester for this shared layer
-                shared = TFSharedEmbeddings(99, 32, name="shared")
-                config.use_cache = inputs_dict.pop("use_cache", None)
-                main_layer = main_layer_class(config, embed_tokens=shared)
-            else:
-                main_layer = main_layer_class(config)
+            # make init deterministic, but make sure that
+            # non-initialized weights throw errors nevertheless
+            model_class_copy._init_weights = self._mock_init_weights
 
-            symbolic_inputs = {
-                name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
-            }
+            model = base_class(config)
+            state_dict = model.state_dict()
 
-            model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
-            outputs = model(inputs_dict)
+            # this will often delete a single weight of a multi-weight module
+            # to test an edge case
+            random_key_to_del = random.choice(list(state_dict.keys()))
+            del state_dict[random_key_to_del]
 
+            # check that certain keys didn't get saved with the model
             with tempfile.TemporaryDirectory() as tmpdirname:
-                filepath = os.path.join(tmpdirname, "keras_model.h5")
-                model.save(filepath)
-                if "T5" in main_layer_class.__name__:
-                    model = tf.keras.models.load_model(
-                        filepath,
-                        custom_objects={
-                            main_layer_class.__name__: main_layer_class,
-                            "TFSharedEmbeddings": TFSharedEmbeddings,
-                        },
-                    )
-                else:
-                    model = tf.keras.models.load_model(
-                        filepath,
-                        custom_objects={main_layer_class.__name__: main_layer_class},
-                    )
-                assert isinstance(model, tf.keras.Model)
-                after_outputs = model(inputs_dict)
-                self.assert_outputs_same(after_outputs, outputs)
-
-    def assert_outputs_same(self, after_outputs, outputs):
-        # Make sure we don't have nans
-        if isinstance(after_outputs, tf.Tensor):
-            out_1 = after_outputs.numpy()
-        elif isinstance(after_outputs, dict):
-            out_1 = after_outputs[list(after_outputs.keys())[0]].numpy()
-        else:
-            out_1 = after_outputs[0].numpy()
-        out_2 = outputs[0].numpy()
-        self.assertEqual(out_1.shape, out_2.shape)
-        out_1 = out_1[~np.isnan(out_1)]
-        out_2 = out_2[~np.isnan(out_2)]
-        max_diff = np.amax(np.abs(out_1 - out_2))
-        self.assertLessEqual(max_diff, 1e-5)
+                model.save_pretrained(tmpdirname)
+                torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin"))
 
-    @is_pt_tf_cross_test
-    def test_pt_tf_model_equivalence(self):
-        import torch
+                model_fast_init = model_class_copy.from_pretrained(tmpdirname)
+                model_slow_init = model_class_copy.from_pretrained(tmpdirname, _fast_init=False)
 
-        import transformers
+                for key in model_fast_init.state_dict().keys():
+                    max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item()
+                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
 
+    def test_save_load_fast_init_to_base(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
+        base_class = MODEL_MAPPING[config.__class__]
 
-        for model_class in self.all_model_classes:
-            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
-            pt_model_class = getattr(transformers, pt_model_class_name)
-
-            config.output_hidden_states = True
+        if isinstance(base_class, tuple):
+            base_class = base_class[0]
 
-            tf_model = model_class(config)
-            pt_model = pt_model_class(config)
+        for model_class in self.all_model_classes:
 
-            # Check we can load pt model in tf and vice-versa with model => model functions
-            tf_model = transformers.load_pytorch_model_in_tf2_model(
-                tf_model,
-                pt_model,
-                tf_inputs=self._prepare_for_class(inputs_dict, model_class),
-            )
-            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+            if model_class == base_class:
+                continue
 
-            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-            pt_model.eval()
-            pt_inputs_dict = {}
-            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
-                if type(key) == bool:
-                    pt_inputs_dict[name] = key
-                elif name == "input_values":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                elif name == "pixel_values":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                elif name == "input_features":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                else:
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
+            # make a copy of model class to not break future tests
+            # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class
+            class CopyClass(base_class):
+                pass
 
-            with torch.no_grad():
-                pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(
-                self._prepare_for_class(inputs_dict, model_class),
-                training=False,
-            )
+            base_class_copy = CopyClass
 
-            tf_hidden_states = tfo[0].numpy()
-            pt_hidden_states = pto[0].numpy()
+            # make sure that all keys are expected for test
+            base_class_copy._keys_to_ignore_on_load_missing = []
 
-            tf_nans = np.copy(np.isnan(tf_hidden_states))
-            pt_nans = np.copy(np.isnan(pt_hidden_states))
+            # make init deterministic, but make sure that
+            # non-initialized weights throw errors nevertheless
+            base_class_copy._init_weights = self._mock_init_weights
 
-            pt_hidden_states[tf_nans] = 0
-            tf_hidden_states[tf_nans] = 0
-            pt_hidden_states[pt_nans] = 0
-            tf_hidden_states[pt_nans] = 0
+            model = model_class(config)
+            state_dict = model.state_dict()
 
-            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
-            self.assertLessEqual(max_diff, 4e-2)
+            # this will often delete a single weight of a multi-weight module
+            # to test an edge case
+            random_key_to_del = random.choice(list(state_dict.keys()))
+            del state_dict[random_key_to_del]
 
-            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+            # check that certain keys didn't get saved with the model
             with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
-                torch.save(pt_model.state_dict(), pt_checkpoint_path)
-                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
-
-                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
-                tf_model.save_weights(tf_checkpoint_path)
-                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
+                model.config.save_pretrained(tmpdirname)
+                torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin"))
 
-            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-            pt_model.eval()
-            pt_inputs_dict = {}
-            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
-                if type(key) == bool:
-                    key = np.array(key, dtype=bool)
-                    pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long)
-                elif name == "input_values":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                elif name == "pixel_values":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                elif name == "input_features":
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
-                else:
-                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
+                model_fast_init = base_class_copy.from_pretrained(tmpdirname)
+                model_slow_init = base_class_copy.from_pretrained(tmpdirname, _fast_init=False)
 
-            with torch.no_grad():
-                pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))
-            tfo = tfo[0].numpy()
-            pto = pto[0].numpy()
-            tf_nans = np.copy(np.isnan(tfo))
-            pt_nans = np.copy(np.isnan(pto))
+                for key in model_fast_init.state_dict().keys():
+                    max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item()
+                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
 
-            pto[tf_nans] = 0
-            tfo[tf_nans] = 0
-            pto[pt_nans] = 0
-            tfo[pt_nans] = 0
+    def test_initialization(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-            max_diff = np.amax(np.abs(tfo - pto))
-            self.assertLessEqual(max_diff, 4e-2)
+        configs_no_init = _config_zero_init(config)
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            for name, param in model.named_parameters():
+                if param.requires_grad:
+                    self.assertIn(
+                        ((param.data.mean() * 1e9).round() / 1e9).item(),
+                        [0.0, 1.0],
+                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                    )
 
-    def test_compile_tf_model(self):
+    def test_determinism(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        max_input = getattr(self.model_tester, "max_position_embeddings", 512)
-        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
-        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-        metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
 
         for model_class in self.all_model_classes:
-            if model_class.__name__ in [
-                "TFSpeech2TextModel",
-                "TFSpeech2TextForConditionalGeneration",
-            ]:
-                inputs = {
-                    "decoder_input_ids": tf.keras.Input(
-                        batch_shape=(2, max_input),
-                        name="decoder_input_ids",
-                        dtype="int32",
-                    ),
-                    "input_features": tf.keras.Input(
-                        batch_shape=(
-                            2,
-                            max_input,
-                            self.model_tester.input_feat_per_channel * self.model_tester.input_channels,
-                        ),
-                        name="input_features",
-                        dtype="float32",
-                    ),
-                }
-            elif self.is_encoder_decoder:
-                inputs = {
-                    "decoder_input_ids": tf.keras.Input(
-                        batch_shape=(2, max_input),
-                        name="decoder_input_ids",
-                        dtype="int32",
-                    ),
-                    "input_ids": tf.keras.Input(
-                        batch_shape=(2, max_input),
-                        name="input_ids",
-                        dtype="int32",
-                    ),
-                }
-            # TODO: A better way to handle vision models
-            elif model_class.__name__ in [
-                "TFViTModel",
-                "TFViTForImageClassification",
-                "TFCLIPVisionModel",
-            ]:
-                inputs = tf.keras.Input(
-                    batch_shape=(
-                        3,
-                        self.model_tester.num_channels,
-                        self.model_tester.image_size,
-                        self.model_tester.image_size,
-                    ),
-                    name="pixel_values",
-                    dtype="float32",
-                )
-            elif model_class.__name__ in ["TFCLIPModel"]:
-                inputs = {
-                    "input_ids": tf.keras.Input(
-                        batch_shape=(3, max_input),
-                        name="input_ids",
-                        dtype="int32",
-                    ),
-                    "pixel_values": tf.keras.Input(
-                        batch_shape=(
-                            3,
-                            self.model_tester.vision_model_tester.num_channels,
-                            self.model_tester.vision_model_tester.image_size,
-                            self.model_tester.vision_model_tester.image_size,
-                        ),
-                        name="pixel_values",
-                        dtype="float32",
-                    ),
-                }
-            elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs = tf.keras.Input(
-                    batch_shape=(4, 2, max_input),
-                    name="input_ids",
-                    dtype="int32",
-                )
-            else:
-                inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
-
-            # Prepare our model
             model = model_class(config)
-            model(self._prepare_for_class(inputs_dict, model_class))  # Model must be called before saving.
-            # Let's load it from the disk to be sure we can use pretrained weights
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=False)
-                model = model_class.from_pretrained(tmpdirname)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+                second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+
+            out_1 = first.cpu().numpy()
+            out_2 = second.cpu().numpy()
+            out_1 = out_1[~np.isnan(out_1)]
+            out_2 = out_2[~np.isnan(out_2)]
+            max_diff = np.amax(np.abs(out_1 - out_2))
+            self.assertLessEqual(max_diff, 1e-5)
 
-            outputs_dict = model(inputs)
-            hidden_states = outputs_dict[0]
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
 
-            # Add a dense layer on top to test integration with other keras modules
-            outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
 
-            # Compile extended model
-            extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs])
-            extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
+            if model.config.is_encoder_decoder:
+                expected_arg_names = [
+                    "input_ids",
+                    "attention_mask",
+                    "decoder_input_ids",
+                    "decoder_attention_mask",
+                ]
+                expected_arg_names.extend(
+                    [
+                        "head_mask",
+                        "decoder_head_mask",
+                        "cross_attn_head_mask",
+                        "encoder_outputs",
+                    ]
+                    if {"head_mask", "decoder_head_mask", "cross_attn_head_mask"} <= set(arg_names)  # all three masks required
+                    else ["encoder_outputs"]
+                )
+                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+            else:
+                expected_arg_names = ["input_ids"]
+                self.assertListEqual(arg_names[:1], expected_arg_names)
 
-    def test_keyword_and_dict_args(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_training(self):
+        if not self.model_tester.is_training:
+            return
 
         for model_class in self.all_model_classes:
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class)
+            (
+                config,
+                inputs_dict,
+            ) = self.model_tester.prepare_config_and_inputs_for_common()
+            config.return_dict = True
 
-            outputs_dict = model(inputs)
+            if model_class in get_values(MODEL_MAPPING):
+                continue
 
-            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-            outputs_keywords = model(**inputs_keywords)
-            output_dict = outputs_dict[0].numpy()
-            output_keywords = outputs_keywords[0].numpy()
+            model = model_class(config)
+            model.to(torch_device)
+            model.train()
+            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            loss = model(**inputs).loss
+            loss.backward()
+
+    def test_training_gradient_checkpointing(self):
+        if not self.model_tester.is_training:
+            return
 
-            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
+        for model_class in self.all_model_classes:
+            (
+                config,
+                inputs_dict,
+            ) = self.model_tester.prepare_config_and_inputs_for_common()
+            config.use_cache = False
+            config.return_dict = True
+
+            if model_class in get_values(MODEL_MAPPING) or not model_class.supports_gradient_checkpointing:
+                continue
+            model = model_class(config)
+            model.to(torch_device)
+            model.gradient_checkpointing_enable()
+            model.train()
+            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            loss = model(**inputs).loss
+            loss.backward()
 
     def test_attention_outputs(self):
         (
@@ -602,268 +516,967 @@ def test_attention_outputs(self):
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
-        decoder_seq_length = getattr(
-            self.model_tester,
-            "decoder_seq_length",
-            self.model_tester.seq_length,
-        )
-        encoder_seq_length = getattr(
-            self.model_tester,
-            "encoder_seq_length",
-            self.model_tester.seq_length,
-        )
-        decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-
-        def check_decoder_attentions_output(outputs):
-            out_len = len(outputs)
-            self.assertEqual(min(out_len % 2, out_len % 5), 0)  # differentiation due to newly added cross_attentions
-            decoder_attentions = outputs.decoder_attentions
-            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(decoder_attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    decoder_seq_length,
-                    decoder_key_length,
-                ],
-            )
 
-        def check_encoder_attentions_output(outputs):
-            attentions = [
-                t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions)
-            ]
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    encoder_seq_length,
-                    encoder_key_length,
-                ],
-            )
+        seq_len = getattr(self.model_tester, "seq_length", None)
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
+        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
+        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
+        chunk_length = getattr(self.model_tester, "chunk_length", None)
+        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
+            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
 
         for model_class in self.all_model_classes:
             inputs_dict["output_attentions"] = True
-            inputs_dict["use_cache"] = False
-            config.output_hidden_states = False
+            inputs_dict["output_hidden_states"] = False
+            config.return_dict = True
             model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            out_len = len(outputs)
-            self.assertEqual(config.output_hidden_states, False)
-            check_encoder_attentions_output(outputs)
-
-            if self.is_encoder_decoder:
-                model = model_class(config)
-                outputs = model(self._prepare_for_class(inputs_dict, model_class))
-                self.assertEqual(config.output_hidden_states, False)
-                check_decoder_attentions_output(outputs)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
 
-            # Check that output attentions can also be changed via the config
+            # check that output_attentions also work using config
             del inputs_dict["output_attentions"]
             config.output_attentions = True
             model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            self.assertEqual(config.output_hidden_states, False)
-            check_encoder_attentions_output(outputs)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            if chunk_length is not None:
+                self.assertListEqual(
+                    list(attentions[0].shape[-4:]),
+                    [
+                        self.model_tester.num_attention_heads,
+                        encoder_seq_length,
+                        chunk_length,
+                        encoder_key_length,
+                    ],
+                )
+            else:
+                self.assertListEqual(
+                    list(attentions[0].shape[-3:]),
+                    [
+                        self.model_tester.num_attention_heads,
+                        encoder_seq_length,
+                        encoder_key_length,
+                    ],
+                )
+            out_len = len(outputs)
+
+            if self.is_encoder_decoder:
+                correct_outlen = 5
+
+                # loss is at first position
+                if "labels" in inputs_dict:
+                    correct_outlen += 1  # loss is added to beginning
+                # Question Answering model returns start_logits and end_logits
+                if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
+                    correct_outlen += 1  # start_logits and end_logits instead of only 1 output
+                if "past_key_values" in outputs:
+                    correct_outlen += 1  # past_key_values have been returned
+
+                self.assertEqual(out_len, correct_outlen)
+
+                # decoder attentions
+                decoder_attentions = outputs.decoder_attentions
+                self.assertIsInstance(decoder_attentions, (list, tuple))
+                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                self.assertListEqual(
+                    list(decoder_attentions[0].shape[-3:]),
+                    [
+                        self.model_tester.num_attention_heads,
+                        decoder_seq_length,
+                        decoder_key_length,
+                    ],
+                )
+
+                # cross attentions
+                cross_attentions = outputs.cross_attentions
+                self.assertIsInstance(cross_attentions, (list, tuple))
+                self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
+                self.assertListEqual(
+                    list(cross_attentions[0].shape[-3:]),
+                    [
+                        self.model_tester.num_attention_heads,
+                        decoder_seq_length,
+                        encoder_key_length,
+                    ],
+                )
 
             # Check attention is always last and order is fine
             inputs_dict["output_attentions"] = True
-            config.output_hidden_states = True
+            inputs_dict["output_hidden_states"] = True
             model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
 
-            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
-            self.assertEqual(model.config.output_hidden_states, True)
-            check_encoder_attentions_output(outputs)
+            if hasattr(self.model_tester, "num_hidden_states_types"):
+                added_hidden_states = self.model_tester.num_hidden_states_types
+            elif self.is_encoder_decoder:
+                added_hidden_states = 2
+            else:
+                added_hidden_states = 1
+            self.assertEqual(out_len + added_hidden_states, len(outputs))
 
-    def test_headmasking(self):
-        if not self.test_head_masking:
-            return
+            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+
+            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+            if chunk_length is not None:
+                self.assertListEqual(
+                    list(self_attentions[0].shape[-4:]),
+                    [
+                        self.model_tester.num_attention_heads,
+                        encoder_seq_length,
+                        chunk_length,
+                        encoder_key_length,
+                    ],
+                )
+            else:
+                self.assertListEqual(
+                    list(self_attentions[0].shape[-3:]),
+                    [
+                        self.model_tester.num_attention_heads,
+                        encoder_seq_length,
+                        encoder_key_length,
+                    ],
+                )
 
-        random.Random().seed(42)
+    @slow
+    def test_torchscript(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        random.Random().seed()
+        self._create_and_check_torchscript(config, inputs_dict)
+
+    @slow
+    def test_torchscript_output_attentions(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config.output_attentions = True
+        self._create_and_check_torchscript(config, inputs_dict)
+
+    @slow
+    def test_torchscript_output_hidden_state(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config.output_hidden_states = True
+        self._create_and_check_torchscript(config, inputs_dict)
+
+    def _create_and_check_torchscript(self, config, inputs_dict):
+        if not self.test_torchscript:
+            return
 
-        inputs_dict["output_attentions"] = True
-        config.output_hidden_states = True
         configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+        configs_no_init.torchscript = True
         for model_class in self.all_model_classes:
             model = model_class(config=configs_no_init)
+            model.to(torch_device)
+            model.eval()
+            inputs = self._prepare_for_class(inputs_dict, model_class)
 
-            # Prepare head_mask
-            def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
-                if i == 0:
-                    return tf.concat(
-                        (
-                            tf.zeros(1, dtype=tf.float32),
-                            tf.ones(attention_heads - 1, dtype=tf.float32),
-                        ),
-                        0,
-                    )
-                elif i == num_hidden_layers - 1:
-                    return tf.concat(
+            try:
+                if model.config.is_encoder_decoder:
+                    model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
+                    input_ids = inputs["input_ids"]
+                    attention_mask = inputs["attention_mask"]
+                    decoder_input_ids = inputs["decoder_input_ids"]
+                    decoder_attention_mask = inputs["decoder_attention_mask"]
+                    traced_model = torch.jit.trace(
+                        model,
                         (
-                            tf.zeros(attention_heads - 1, dtype=tf.float32),
-                            tf.ones(1, dtype=tf.float32),
+                            input_ids,
+                            attention_mask,
+                            decoder_input_ids,
+                            decoder_attention_mask,
                         ),
-                        0,
                     )
                 else:
-                    return tf.ones(attention_heads, dtype=tf.float32)
-
-            head_mask = tf.stack(
-                [
-                    prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers)
-                    for i in range(config.num_hidden_layers)
-                ],
-                0,
+                    input_ids = inputs["input_ids"]
+                    traced_model = torch.jit.trace(model, input_ids)
+            except RuntimeError:
+                self.fail("Couldn't trace module.")
+
+            with tempfile.TemporaryDirectory() as tmp_dir_name:
+                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
+
+                try:
+                    torch.jit.save(traced_model, pt_file_name)
+                except Exception:
+                    self.fail("Couldn't save module.")
+
+                try:
+                    loaded_model = torch.jit.load(pt_file_name)
+                except Exception:
+                    self.fail("Couldn't load module.")
+
+            model.to(torch_device)
+            model.eval()
+
+            loaded_model.to(torch_device)
+            loaded_model.eval()
+
+            model_state_dict = model.state_dict()
+            loaded_model_state_dict = loaded_model.state_dict()
+
+            non_persistent_buffers = {}
+            for key in loaded_model_state_dict.keys():
+                if key not in model_state_dict.keys():
+                    non_persistent_buffers[key] = loaded_model_state_dict[key]
+
+            loaded_model_state_dict = {
+                key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
+            }
+
+            self.assertEqual(
+                set(model_state_dict.keys()),
+                set(loaded_model_state_dict.keys()),
             )
 
+            model_buffers = list(model.buffers())
+            for non_persistent_buffer in non_persistent_buffers.values():
+                found_buffer = False
+                for i, model_buffer in enumerate(model_buffers):
+                    if torch.equal(non_persistent_buffer, model_buffer):
+                        found_buffer = True
+                        break
+
+                self.assertTrue(found_buffer)
+                model_buffers.pop(i)
+
+            models_equal = True
+            for layer_name, p1 in model_state_dict.items():
+                if layer_name in loaded_model_state_dict:
+                    p2 = loaded_model_state_dict[layer_name]
+                    if p1.data.ne(p2.data).sum() > 0:
+                        models_equal = False
+
+            self.assertTrue(models_equal)
+
+    def test_torch_fx(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        self._create_and_check_torch_fx_tracing(config, inputs_dict)
+
+    def test_torch_fx_output_loss(self):
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        self._create_and_check_torch_fx_tracing(config, inputs_dict, output_loss=True)
+
+    def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
+        if not is_torch_fx_available() or not self.fx_compatible:
+            return
+
+        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+        configs_no_init.return_dict = False
+
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            model.to(torch_device)
+            model.eval()
+            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss)
+
+            try:
+                if model.config.is_encoder_decoder:
+                    model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
+                    labels = inputs.get("labels", None)
+                    input_names = [
+                        "input_ids",
+                        "attention_mask",
+                        "decoder_input_ids",
+                        "decoder_attention_mask",
+                    ]
+                    if labels is not None:
+                        input_names.append("labels")
+                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
+
+                    model_output = model(**filtered_inputs)
+
+                    traced_model = symbolic_trace(model, input_names)
+                    traced_output = traced_model(**filtered_inputs)
+                else:
+                    input_names = [
+                        "input_ids",
+                        "attention_mask",
+                        "token_type_ids",
+                    ]
+                    input_ids = inputs["input_ids"]
+
+                    labels = inputs.get("labels", None)
+                    start_positions = inputs.get("start_positions", None)
+                    end_positions = inputs.get("end_positions", None)
+                    if labels is not None:
+                        input_names.append("labels")
+                    if start_positions is not None:
+                        input_names.append("start_positions")
+                    if end_positions is not None:
+                        input_names.append("end_positions")
+
+                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
+                    input_names = filtered_inputs.keys()
+
+                    model_output = model(**filtered_inputs)
+
+                    rank = len(input_ids.shape)
+                    if rank not in [2, 3]:
+                        raise NotImplementedError(
+                            f"symbolic_trace automatic parameters inference not implemented for input of rank {rank}."
+                        )
+
+                    traced_model = symbolic_trace(model, input_names)
+                    traced_output = traced_model(**filtered_inputs)
+
+            except RuntimeError:
+                self.fail("Couldn't trace module.")
+
+            def flatten_output(output):
+                flatten = []
+                for x in output:
+                    if isinstance(x, (tuple, list)):
+                        flatten += flatten_output(x)
+                    elif not isinstance(x, torch.Tensor):
+                        continue
+                    else:
+                        flatten.append(x)
+                return flatten
+
+            model_output = flatten_output(model_output)
+            traced_output = flatten_output(traced_output)
+            num_outputs = len(model_output)
+
+            for i in range(num_outputs):
+                self.assertTrue(
+                    torch.allclose(model_output[i], traced_output[i]),
+                    f"traced {i}th output doesn't match model {i}th output for {model_class}",
+                )
+
+    def test_headmasking(self):
+        if not self.test_head_masking:
+            return
+
+        global_rng.seed(42)
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        global_rng.seed()
+
+        inputs_dict["output_attentions"] = True
+        config.output_hidden_states = True
+        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            model.to(torch_device)
+            model.eval()
+
+            # Prepare head_mask
+            # Set requires_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
+            head_mask = torch.ones(
+                self.model_tester.num_hidden_layers,
+                self.model_tester.num_attention_heads,
+                device=torch_device,
+            )
+            head_mask[0, 0] = 0
+            head_mask[-1, :-1] = 0
+            head_mask.requires_grad_(requires_grad=True)
             inputs = self._prepare_for_class(inputs_dict, model_class).copy()
             inputs["head_mask"] = head_mask
             if model.config.is_encoder_decoder:
-                signature = inspect.signature(model.call)
+                signature = inspect.signature(model.forward)
                 arg_names = [*signature.parameters.keys()]
                 if "decoder_head_mask" in arg_names:  # necessary diferentiation because of T5 model
                     inputs["decoder_head_mask"] = head_mask
                 if "cross_attn_head_mask" in arg_names:
                     inputs["cross_attn_head_mask"] = head_mask
-
             outputs = model(**inputs, return_dict=True)
 
+            # Test that we can get a gradient back for importance score computation
+            output = sum(t.sum() for t in outputs[0])
+            output = output.sum()
+            output.backward()
+            multihead_outputs = head_mask.grad
+
+            self.assertIsNotNone(multihead_outputs)
+            self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
+
             def check_attentions_validity(attentions):
                 # Remove Nan
                 for t in attentions:
                     self.assertLess(
-                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(),
-                        (tf.size(t) / 4).numpy(),
+                        torch.sum(torch.isnan(t)), t.numel() / 4
                     )  # Check we don't have more than 25% nans (arbitrary)
-
                 attentions = [
-                    tf.where(tf.math.is_nan(t), 0.0, t) for t in attentions
+                    t.masked_fill(torch.isnan(t), 0.0) for t in attentions
                 ]  # remove them (the test is less complete)
 
-                self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
-                self.assertNotEqual(
-                    tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(),
-                    0.0,
-                )
-                if len(attentions) > 2:  # encoder-decodere models have only 2 layers in each modules
-                    self.assertNotEqual(
-                        tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(),
-                        0.0,
-                    )
-                self.assertAlmostEqual(
-                    tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(),
-                    0.0,
-                )
-                self.assertNotEqual(
-                    tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(),
-                    0.0,
-                )
+                self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
+                self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
+                if len(attentions) > 2:  # encoder-decoder models have only 2 layers in each module
+                    self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
+                self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
+                self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
 
             if model.config.is_encoder_decoder:
                 check_attentions_validity(outputs.encoder_attentions)
                 check_attentions_validity(outputs.decoder_attentions)
-                if "cross_attn_head_mask" in arg_names:
-                    check_attentions_validity(outputs.cross_attentions)
+                check_attentions_validity(outputs.cross_attentions)
             else:
                 check_attentions_validity(outputs.attentions)
 
-    def test_hidden_states_output(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_head_pruning(self):
+        if not self.test_pruning:
+            return
+
+        for model_class in self.all_model_classes:
+            (
+                config,
+                inputs_dict,
+            ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+            if "head_mask" in inputs_dict:
+                del inputs_dict["head_mask"]
+
+            inputs_dict["output_attentions"] = True
+            config.output_hidden_states = False
+            model = model_class(config=config)
+            model.to(torch_device)
+            model.eval()
+            heads_to_prune = {
+                0: list(range(1, self.model_tester.num_attention_heads)),
+                -1: [0],
+            }
+            model.prune_heads(heads_to_prune)
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            attentions = outputs[-1]
+
+            self.assertEqual(attentions[0].shape[-3], 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+            self.assertEqual(
+                attentions[-1].shape[-3],
+                self.model_tester.num_attention_heads - 1,
+            )
+
+    def test_head_pruning_save_load_from_pretrained(self):
+        if not self.test_pruning:
+            return
+
+        for model_class in self.all_model_classes:
+            (
+                config,
+                inputs_dict,
+            ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+            if "head_mask" in inputs_dict:
+                del inputs_dict["head_mask"]
+
+            inputs_dict["output_attentions"] = True
+            config.output_hidden_states = False
+            model = model_class(config=config)
+            model.to(torch_device)
+            model.eval()
+            heads_to_prune = {
+                0: list(range(1, self.model_tester.num_attention_heads)),
+                -1: [0],
+            }
+            model.prune_heads(heads_to_prune)
+
+            with tempfile.TemporaryDirectory() as temp_dir_name:
+                model.save_pretrained(temp_dir_name)
+                model = model_class.from_pretrained(temp_dir_name)
+                model.to(torch_device)
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs[-1]
+            self.assertEqual(attentions[0].shape[-3], 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+            self.assertEqual(
+                attentions[-1].shape[-3],
+                self.model_tester.num_attention_heads - 1,
+            )
+
+    def test_head_pruning_save_load_from_config_init(self):
+        if not self.test_pruning:
+            return
+
+        for model_class in self.all_model_classes:
+            (
+                config,
+                inputs_dict,
+            ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+            if "head_mask" in inputs_dict:
+                del inputs_dict["head_mask"]
+
+            inputs_dict["output_attentions"] = True
+            config.output_hidden_states = False
+
+            heads_to_prune = {
+                0: list(range(1, self.model_tester.num_attention_heads)),
+                -1: [0],
+            }
+            config.pruned_heads = heads_to_prune
+
+            model = model_class(config=config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs[-1]
+
+            self.assertEqual(attentions[0].shape[-3], 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+            self.assertEqual(
+                attentions[-1].shape[-3],
+                self.model_tester.num_attention_heads - 1,
+            )
+
+    def test_head_pruning_integration(self):
+        if not self.test_pruning:
+            return
+
+        for model_class in self.all_model_classes:
+            (
+                config,
+                inputs_dict,
+            ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+            if "head_mask" in inputs_dict:
+                del inputs_dict["head_mask"]
+
+            inputs_dict["output_attentions"] = True
+            config.output_hidden_states = False
+
+            heads_to_prune = {0: [0], 1: [1, 2]}
+            config.pruned_heads = heads_to_prune
+
+            model = model_class(config=config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs[-1]
+
+            self.assertEqual(
+                attentions[0].shape[-3],
+                self.model_tester.num_attention_heads - 1,
+            )
+            self.assertEqual(
+                attentions[1].shape[-3],
+                self.model_tester.num_attention_heads - 2,
+            )
+            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
+            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
+
+            with tempfile.TemporaryDirectory() as temp_dir_name:
+                model.save_pretrained(temp_dir_name)
+                model = model_class.from_pretrained(temp_dir_name)
+                model.to(torch_device)
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs[-1]
+
+            self.assertEqual(
+                attentions[0].shape[-3],
+                self.model_tester.num_attention_heads - 1,
+            )
+            self.assertEqual(
+                attentions[1].shape[-3],
+                self.model_tester.num_attention_heads - 2,
+            )
+            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
+            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
 
-        def check_hidden_states_output(config, inputs_dict, model_class):
+            heads_to_prune = {0: [0], 2: [1, 2]}
+            model.prune_heads(heads_to_prune)
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs[-1]
+
+            self.assertEqual(
+                attentions[0].shape[-3],
+                self.model_tester.num_attention_heads - 1,
+            )
+            self.assertEqual(
+                attentions[1].shape[-3],
+                self.model_tester.num_attention_heads - 2,
+            )
+            self.assertEqual(
+                attentions[2].shape[-3],
+                self.model_tester.num_attention_heads - 2,
+            )
+            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
+
+            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
+
+    def test_hidden_states_output(self):
+        def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+
             expected_num_layers = getattr(
                 self.model_tester,
                 "expected_num_hidden_layers",
                 self.model_tester.num_hidden_layers + 1,
             )
+            self.assertEqual(len(hidden_states), expected_num_layers)
 
-            if model.config.is_encoder_decoder:
-                encoder_hidden_states = outputs.encoder_hidden_states
-                decoder_hidden_states = outputs.decoder_hidden_states
-
-                self.assertEqual(config.output_attentions, False)
-                self.assertEqual(len(encoder_hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(encoder_hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
-                )
-                self.assertEqual(len(decoder_hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(decoder_hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
-                )
+            if hasattr(self.model_tester, "encoder_seq_length"):
+                seq_length = self.model_tester.encoder_seq_length
+                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
+                    seq_length = seq_length * self.model_tester.chunk_length
             else:
-                hidden_states = outputs.hidden_states
-                self.assertEqual(config.output_attentions, False)
+                seq_length = self.model_tester.seq_length
+
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]),
+                [seq_length, self.model_tester.hidden_size],
+            )
+
+            if config.is_encoder_decoder:
+                hidden_states = outputs.decoder_hidden_states
+
+                self.assertIsInstance(hidden_states, (list, tuple))
                 self.assertEqual(len(hidden_states), expected_num_layers)
+                seq_len = getattr(self.model_tester, "seq_length", None)
+                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
+                    [decoder_seq_length, self.model_tester.hidden_size],
                 )
 
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
         for model_class in self.all_model_classes:
             inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(config, inputs_dict, model_class)
+            check_hidden_states_output(inputs_dict, config, model_class)
 
+            # check that output_hidden_states also works using config
             del inputs_dict["output_hidden_states"]
             config.output_hidden_states = True
-            check_hidden_states_output(config, inputs_dict, model_class)
 
-    def test_model_common_attributes(self):
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+    def test_retain_grad_hidden_states_attentions(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        text_in_text_out_models = (
-            get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
-            + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
-            + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING)
-        )
-        speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING)
+        config.output_hidden_states = True
+        config.output_attentions = True
+
+        # no need to test all models as different heads yield the same functionality
+        model_class = self.all_model_classes[0]
+        model = model_class(config)
+        model.to(torch_device)
+
+        inputs = self._prepare_for_class(inputs_dict, model_class)
+
+        outputs = model(**inputs)
+
+        output = outputs[0]
+
+        if config.is_encoder_decoder:
+            # Seq2Seq models
+            encoder_hidden_states = outputs.encoder_hidden_states[0]
+            encoder_attentions = outputs.encoder_attentions[0]
+            encoder_hidden_states.retain_grad()
+            encoder_attentions.retain_grad()
+
+            decoder_hidden_states = outputs.decoder_hidden_states[0]
+            decoder_attentions = outputs.decoder_attentions[0]
+            decoder_hidden_states.retain_grad()
+            decoder_attentions.retain_grad()
 
+            cross_attentions = outputs.cross_attentions[0]
+            cross_attentions.retain_grad()
+
+            output.flatten()[0].backward(retain_graph=True)
+
+            self.assertIsNotNone(encoder_hidden_states.grad)
+            self.assertIsNotNone(encoder_attentions.grad)
+            self.assertIsNotNone(decoder_hidden_states.grad)
+            self.assertIsNotNone(decoder_attentions.grad)
+            self.assertIsNotNone(cross_attentions.grad)
+        else:
+            # Encoder-/Decoder-only models
+            hidden_states = outputs.hidden_states[0]
+            attentions = outputs.attentions[0]
+
+            hidden_states.retain_grad()
+            attentions.retain_grad()
+
+            output.flatten()[0].backward(retain_graph=True)
+
+            self.assertIsNotNone(hidden_states.grad)
+            self.assertIsNotNone(attentions.grad)
+
+    def test_feed_forward_chunking(self):
+        (
+            original_config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
+            torch.manual_seed(0)
+            config = copy.deepcopy(original_config)
             model = model_class(config)
-            assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
-            if model_class in text_in_text_out_models:
-                x = model.get_output_embeddings()
-                assert isinstance(x, tf.keras.layers.Layer)
-                name = model.get_bias()
-                assert isinstance(name, dict)
-                for k, v in name.items():
-                    assert isinstance(v, tf.Variable)
-            elif model_class in speech_in_text_out_models:
-                x = model.get_output_embeddings()
-                assert isinstance(x, tf.keras.layers.Layer)
-                name = model.get_bias()
-                assert name is None
+            model.to(torch_device)
+            model.eval()
+
+            hidden_states_no_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+
+            torch.manual_seed(0)
+            config.chunk_size_feed_forward = 1
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            hidden_states_with_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+            self.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3))
+
+    def test_resize_position_vector_embeddings(self):
+        if not self.test_resize_position_embeddings:
+            return
+
+        (
+            original_config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config)
+            model.to(torch_device)
+
+            if self.model_tester.is_training is False:
+                model.eval()
+
+            max_position_embeddings = config.max_position_embeddings
+
+            # Retrieve the embeddings and clone them
+            if model.config.is_encoder_decoder:
+                (
+                    encoder_model_embed,
+                    decoder_model_embed,
+                ) = model.get_position_embeddings()
+                encoder_cloned_embeddings = encoder_model_embed.weight.clone()
+                decoder_cloned_embeddings = decoder_model_embed.weight.clone()
             else:
-                x = model.get_output_embeddings()
-                assert x is None
-                name = model.get_bias()
-                assert name is None
+                model_embed = model.get_position_embeddings()
+                cloned_embeddings = model_embed.weight.clone()
+
+            # Check that resizing the position embeddings with a larger max_position_embeddings increases
+            # the model's position embeddings size
+            model.resize_position_embeddings(max_position_embeddings + 10)
+            self.assertEqual(
+                model.config.max_position_embeddings,
+                max_position_embeddings + 10,
+            )
 
-    def test_determinism(self):
+            # Check that it actually resizes the embeddings matrix
+            if model.config.is_encoder_decoder:
+                (
+                    encoder_model_embed,
+                    decoder_model_embed,
+                ) = model.get_position_embeddings()
+                self.assertEqual(
+                    encoder_model_embed.weight.shape[0],
+                    encoder_cloned_embeddings.shape[0] + 10,
+                )
+                self.assertEqual(
+                    decoder_model_embed.weight.shape[0],
+                    decoder_cloned_embeddings.shape[0] + 10,
+                )
+            else:
+                model_embed = model.get_position_embeddings()
+                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the position embeddings with a smaller max_position_embeddings decreases
+            # the model's max_position_embeddings
+            model.resize_position_embeddings(max_position_embeddings - 5)
+            self.assertEqual(
+                model.config.max_position_embeddings,
+                max_position_embeddings - 5,
+            )
+
+            # Check that it actually resizes the embeddings matrix
+            if model.config.is_encoder_decoder:
+                (
+                    encoder_model_embed,
+                    decoder_model_embed,
+                ) = model.get_position_embeddings()
+                self.assertEqual(
+                    encoder_model_embed.weight.shape[0],
+                    encoder_cloned_embeddings.shape[0] - 5,
+                )
+                self.assertEqual(
+                    decoder_model_embed.weight.shape[0],
+                    decoder_cloned_embeddings.shape[0] - 5,
+                )
+            else:
+                model_embed = model.get_position_embeddings()
+                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5)
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+            models_equal = True
+
+            if model.config.is_encoder_decoder:
+                for p1, p2 in zip(encoder_cloned_embeddings, encoder_model_embed.weight):
+                    if p1.data.ne(p2.data).sum() > 0:
+                        models_equal = False
+                for p1, p2 in zip(decoder_cloned_embeddings, decoder_model_embed.weight):
+                    if p1.data.ne(p2.data).sum() > 0:
+                        models_equal = False
+            else:
+                for p1, p2 in zip(cloned_embeddings, model_embed.weight):
+                    if p1.data.ne(p2.data).sum() > 0:
+                        models_equal = False
+
+            self.assertTrue(models_equal)
+
+    def test_resize_tokens_embeddings(self):
+        (
+            original_config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        if not self.test_resize_embeddings:
+            return
+
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config)
+            model.to(torch_device)
+
+            if self.model_tester.is_training is False:
+                model.eval()
+
+            model_vocab_size = config.vocab_size
+            # Retrieve the embeddings and clone them
+            model_embed = model.resize_token_embeddings(model_vocab_size)
+            cloned_embeddings = model_embed.weight.clone()
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Input ids should be clamped to the maximum size of the vocabulary
+            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+
+            # make sure that decoder_input_ids are resized as well
+            if "decoder_input_ids" in inputs_dict:
+                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+            models_equal = True
+            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
+                if p1.data.ne(p2.data).sum() > 0:
+                    models_equal = False
+
+            self.assertTrue(models_equal)
+
+    def test_resize_embeddings_untied(self):
+        (
+            original_config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        if not self.test_resize_embeddings:
+            return
+
+        original_config.tie_word_embeddings = False
+
+        # if model cannot untie embeddings -> leave test
+        if original_config.tie_word_embeddings:
+            return
+
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config).to(torch_device)
+
+            # if no output embeddings -> leave test
+            if model.get_output_embeddings() is None:
+                continue
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_vocab_size = config.vocab_size
+            model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
+            output_embeds = model.get_output_embeddings()
+            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
+            # Check bias if present
+            if output_embeds.bias is not None:
+                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            output_embeds = model.get_output_embeddings()
+            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
+            # Check bias if present
+            if output_embeds.bias is not None:
+                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Input ids should be clamped to the maximum size of the vocabulary
+            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            if "decoder_input_ids" in inputs_dict:
+                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+    def test_model_common_attributes(self):
         (
             config,
             inputs_dict,
@@ -871,50 +1484,127 @@ def test_determinism(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            first, second = (
-                model(
-                    self._prepare_for_class(inputs_dict, model_class),
-                    training=False,
-                )[0],
-                model(
-                    self._prepare_for_class(inputs_dict, model_class),
-                    training=False,
-                )[0],
-            )
-            out_1 = first.numpy()
-            out_2 = second.numpy()
-            out_1 = out_1[~np.isnan(out_1)]
-            out_2 = out_2[~np.isnan(out_2)]
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding, AdaptiveEmbedding))
+            model.set_input_embeddings(nn.Embedding(10, 10))
+            x = model.get_output_embeddings()
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
+
+    def test_model_main_input_name(self):
+        for model_class in self.all_model_classes:
+            model_signature = inspect.signature(getattr(model_class, "forward"))
+            # The main input is the name of the argument after `self`
+            observed_main_input_name = list(model_signature.parameters.keys())[1]
+            self.assertEqual(model_class.main_input_name, observed_main_input_name)
+
+    def test_correct_missing_keys(self):
+        if not self.test_missing_keys:
+            return
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            base_model_prefix = model.base_model_prefix
+
+            if hasattr(model, base_model_prefix):
+                with tempfile.TemporaryDirectory() as temp_dir_name:
+                    model.base_model.save_pretrained(temp_dir_name)
+                    model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True)
+                    with self.subTest(msg=f"Missing keys for {model.__class__.__name__}"):
+                        self.assertGreater(len(loading_info["missing_keys"]), 0)
+
+    def test_tie_model_weights(self):
+        if not self.test_torchscript:
+            return
+
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def check_same_values(layer_1, layer_2):
+            equal = True
+            for p1, p2 in zip(layer_1.weight, layer_2.weight):
+                if p1.data.ne(p2.data).sum() > 0:
+                    equal = False
+            return equal
+
+        for model_class in self.all_model_classes:
+            config.torchscript = True
+            model_not_tied = model_class(config)
+            if model_not_tied.get_output_embeddings() is None:
+                continue
 
-    def test_model_outputs_equivalence(self):
+            config_tied = copy.deepcopy(config)
+            config_tied.torchscript = False
+            model_tied = model_class(config_tied)
+            params_tied = list(model_tied.parameters())
+            # Check that the embedding layer and decoding layer are the same in size and in value
+            # self.assertTrue(check_same_values(embeddings, decoding))
+
+            # # Check that after modification, they remain the same.
+            # embeddings.weight.data.div_(2)
+            # # Check that the embedding layer and decoding layer are the same in size and in value
+            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
+            # self.assertTrue(check_same_values(embeddings, decoding))
+
+            # # Check that after modification, they remain the same.
+            # decoding.weight.data.div_(4)
+            # # Check that the embedding layer and decoding layer are the same in size and in value
+            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
+            # self.assertTrue(check_same_values(embeddings, decoding))
+
+            # Check that after resize they remain tied.
+            model_tied.resize_token_embeddings(config.vocab_size + 10)
+            params_tied_2 = list(model_tied.parameters())
+            self.assertEqual(len(params_tied_2), len(params_tied))
+
+            # decoding.weight.data.mul_(20)
+            # # Check that the embedding layer and decoding layer are the same in size and in value
+            # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
+            # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
 
+    def test_model_outputs_equivalence(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
 
+        def set_nan_tensor_to_zero(t):
+            t[t != t] = 0
+            return t
+
         def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
-            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
-            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
-
-            def recursive_check(tuple_object, dict_object):
-                if isinstance(tuple_object, (List, Tuple)):
-                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
-                        recursive_check(tuple_iterable_value, dict_iterable_value)
-                elif tuple_object is None:
-                    return
-                else:
-                    self.assertTrue(
-                        all(tf.equal(tuple_object, dict_object)),
-                        msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}",
-                    )
+            with torch.no_grad():
+                tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
+                dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
+
+                def recursive_check(tuple_object, dict_object):
+                    if isinstance(tuple_object, (List, Tuple)):
+                        for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
+                            recursive_check(tuple_iterable_value, dict_iterable_value)
+                    elif isinstance(tuple_object, Dict):
+                        for tuple_iterable_value, dict_iterable_value in zip(
+                            tuple_object.values(), dict_object.values()
+                        ):
+                            recursive_check(tuple_iterable_value, dict_iterable_value)
+                    elif tuple_object is None:
+                        return
+                    else:
+                        self.assertTrue(
+                            torch.allclose(
+                                set_nan_tensor_to_zero(tuple_object),
+                                set_nan_tensor_to_zero(dict_object),
+                                atol=1e-5,
+                            ),
+                            msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.",
+                        )
 
                 recursive_check(tuple_output, dict_output)
 
         for model_class in self.all_model_classes:
             model = model_class(config)
+            model.to(torch_device)
+            model.eval()
 
             tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
             dict_inputs = self._prepare_for_class(inputs_dict, model_class)
@@ -949,434 +1639,523 @@ def recursive_check(tuple_object, dict_object):
                 {"output_hidden_states": True, "output_attentions": True},
             )
 
-    def test_inputs_embeds(self):
+    @is_pt_tf_cross_test
+    def test_pt_tf_model_equivalence(self):
+        import numpy as np
+        import tensorflow as tf
+
+        import transformers
+
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
-            model = model_class(config)
+            tf_model_class_name = "TF" + model_class.__name__  # Add the "TF" at the beginning
 
-            inputs = copy.deepcopy(inputs_dict)
+            if not hasattr(transformers, tf_model_class_name):
+                # transformers does not have TF version yet
+                return
 
-            if not self.is_encoder_decoder:
-                input_ids = inputs["input_ids"]
-                del inputs["input_ids"]
-            else:
-                encoder_input_ids = inputs["input_ids"]
-                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
-                del inputs["input_ids"]
-                inputs.pop("decoder_input_ids", None)
+            tf_model_class = getattr(transformers, tf_model_class_name)
 
-            if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
-            else:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)
+            config.output_hidden_states = True
 
-            inputs = self._prepare_for_class(inputs, model_class)
+            tf_model = tf_model_class(config)
+            pt_model = model_class(config)
 
-            model(inputs)
+            # make sure only TF inputs that actually exist in the function args are forwarded
+            tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys())
 
-    def test_numpy_arrays_inputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+            # remove all head masks
+            tf_input_keys.discard("head_mask")
+            tf_input_keys.discard("cross_attn_head_mask")
+            tf_input_keys.discard("decoder_head_mask")
 
-        def prepare_numpy_arrays(inputs_dict):
-            inputs_np_dict = {}
-            for k, v in inputs_dict.items():
-                if tf.is_tensor(v):
-                    inputs_np_dict[k] = v.numpy()
+            pt_inputs = self._prepare_for_class(inputs_dict, model_class)
+            pt_inputs = {k: v for k, v in pt_inputs.items() if k in tf_input_keys}
+
+            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
+            pt_model.eval()
+            tf_inputs_dict = {}
+            for key, tensor in pt_inputs.items():
+                # pass booleans through unchanged; convert tensors to TF tensors of the matching dtype
+                if type(tensor) == bool:
+                    tf_inputs_dict[key] = tensor
+                elif key == "input_values":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
+                elif key == "pixel_values":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
+                elif key == "input_features":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
                 else:
-                    inputs_np_dict[k] = np.array(k)
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32)
 
-            return inputs_np_dict
+            # Check we can load pt model in tf and vice-versa with model => model functions
+            tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict)
+            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model).to(torch_device)
 
-        for model_class in self.all_model_classes:
-            model = model_class(config)
+            # need to rename encoder-decoder "inputs" for PyTorch
+            #            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
+            #                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
 
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-            inputs_np = prepare_numpy_arrays(inputs)
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs)
+            tfo = tf_model(tf_inputs_dict, training=False)
 
-            output_for_dict_input = model(inputs_np)
-            output_for_kw_input = model(**inputs_np)
-            self.assert_outputs_same(output_for_dict_input, output_for_kw_input)
+            tf_hidden_states = tfo[0].numpy()
+            pt_hidden_states = pto[0].cpu().numpy()
 
-    def test_resize_token_embeddings(self):
-        if not self.test_resize_embeddings:
-            return
+            tf_nans = np.copy(np.isnan(tf_hidden_states))
+            pt_nans = np.copy(np.isnan(pt_hidden_states))
+
+            pt_hidden_states[tf_nans] = 0
+            tf_hidden_states[tf_nans] = 0
+            pt_hidden_states[pt_nans] = 0
+            tf_hidden_states[pt_nans] = 0
+
+            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
+            self.assertLessEqual(max_diff, 4e-2)
+
+            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
+                torch.save(pt_model.state_dict(), pt_checkpoint_path)
+                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
+
+                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
+                tf_model.save_weights(tf_checkpoint_path)
+                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
+                pt_model = pt_model.to(torch_device)
+
+            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
+            pt_model.eval()
+            tf_inputs_dict = {}
+            for key, tensor in pt_inputs.items():
+                # convert booleans to int32 TF tensors; convert tensors to TF tensors of the matching dtype
+                if type(tensor) == bool:
+                    tensor = np.array(tensor, dtype=bool)
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32)
+                elif key == "input_values":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
+                elif key == "pixel_values":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
+                elif key == "input_features":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
+                else:
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32)
+
+            # need to rename encoder-decoder "inputs" for PyTorch
+            #            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
+            #                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs)
+
+            tfo = tf_model(tf_inputs_dict)
+            tfo = tfo[0].numpy()
+            pto = pto[0].cpu().numpy()
+            tf_nans = np.copy(np.isnan(tfo))
+            pt_nans = np.copy(np.isnan(pto))
+
+            pto[tf_nans] = 0
+            tfo[tf_nans] = 0
+            pto[pt_nans] = 0
+            tfo[pt_nans] = 0
+
+            max_diff = np.amax(np.abs(tfo - pto))
+            self.assertLessEqual(max_diff, 4e-2)
+
+    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
+        diff = np.abs((a - b)).max()
+        self.assertLessEqual(
+            diff,
+            tol,
+            f"Difference between torch and flax is {diff} (>= {tol}).",
+        )
+
+    @is_pt_flax_cross_test
+    def test_equivalence_pt_to_flax(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-        def _get_word_embedding_weight(model, embedding_layer):
-            embeds = getattr(embedding_layer, "weight", None)
-            if embeds is not None:
-                return embeds
+        for model_class in self.all_model_classes:
+            with self.subTest(model_class.__name__):
 
-            embeds = getattr(embedding_layer, "decoder", None)
-            if embeds is not None:
-                return embeds
+                # load PyTorch class
+                pt_model = model_class(config).eval()
+                # Flax models don't use the `use_cache` option and cache is not returned as a default.
+                # So we disable `use_cache` here for PyTorch model.
+                pt_model.config.use_cache = False
 
-            model(model.dummy_inputs)
+                fx_model_class_name = "Flax" + model_class.__name__
 
-            embeds = getattr(embedding_layer, "weight", None)
-            if embeds is not None:
-                return embeds
+                if not hasattr(transformers, fx_model_class_name):
+                    return
 
-            embeds = getattr(embedding_layer, "decoder", None)
-            if embeds is not None:
-                return embeds
+                fx_model_class = getattr(transformers, fx_model_class_name)
 
-            return None
+                # load Flax class
+                fx_model = fx_model_class(config, dtype=jnp.float32)
+                # make sure only Flax inputs that actually exist in the function args are forwarded
+                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
 
-        for model_class in self.all_model_classes:
-            for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
-                # build the embeddings
-                model = model_class(config=config)
-                old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
-                old_bias = model.get_bias()
-                old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
-                # reshape the embeddings
-                model.resize_token_embeddings(size)
-                new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
-                new_bias = model.get_bias()
-                new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
-
-                # check that the resized embeddings size matches the desired size.
-                assert_size = size if size is not None else config.vocab_size
-                self.assertEqual(new_input_embeddings.shape[0], assert_size)
-
-                # check that weights remain the same after resizing
-                models_equal = True
-                for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()):
-                    if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                        models_equal = False
-                self.assertTrue(models_equal)
-
-                if old_bias is not None and new_bias is not None:
-                    for old_weight, new_weight in zip(old_bias.values(), new_bias.values()):
-                        self.assertEqual(new_weight.shape[0], assert_size)
-
-                        models_equal = True
-                        for p1, p2 in zip(old_weight.value(), new_weight.value()):
-                            if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                                models_equal = False
-                        self.assertTrue(models_equal)
-
-                if old_output_embeddings is not None and new_output_embeddings is not None:
-                    self.assertEqual(new_output_embeddings.shape[0], assert_size)
-                    self.assertEqual(
-                        new_output_embeddings.shape[1],
-                        old_output_embeddings.shape[1],
-                    )
+                # prepare inputs
+                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
+
+                # remove function args that don't exist in Flax
+                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
+
+                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
+                fx_model.params = fx_state
 
-                    models_equal = True
-                    for p1, p2 in zip(
-                        old_output_embeddings.value(),
-                        new_output_embeddings.value(),
-                    ):
-                        if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                            models_equal = False
-                    self.assertTrue(models_equal)
+                with torch.no_grad():
+                    pt_outputs = pt_model(**pt_inputs).to_tuple()
 
-    def test_lm_head_model_random_no_beam_search_generate(self):
+                # convert inputs to Flax
+                fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)}
+                fx_outputs = fx_model(**fx_inputs).to_tuple()
+                self.assertEqual(
+                    len(fx_outputs),
+                    len(pt_outputs),
+                    "Output lengths differ between Flax and PyTorch",
+                )
+                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
+
+                with tempfile.TemporaryDirectory() as tmpdirname:
+                    pt_model.save_pretrained(tmpdirname)
+                    fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True)
+
+                fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple()
+                self.assertEqual(
+                    len(fx_outputs_loaded),
+                    len(pt_outputs),
+                    "Output lengths differ between Flax and PyTorch",
+                )
+                for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
+                    self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2)
+
+    @is_pt_flax_cross_test
+    def test_equivalence_flax_to_pt(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
 
-        # iterate over all generative models
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
+        for model_class in self.all_model_classes:
+            with self.subTest(model_class.__name__):
+                # load corresponding PyTorch class
+                pt_model = model_class(config).eval()
 
-            if config.bos_token_id is None:
-                # if bos token id is not defined model needs input_ids
-                with self.assertRaises(AssertionError):
-                    model.generate(do_sample=True, max_length=5)
-                # num_return_sequences = 1
-                self._check_generated_ids(model.generate(input_ids, do_sample=True))
-            elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]:
-                # Models with non-text inputs won't work here; num_return_sequences = 1
-                self._check_generated_ids(model.generate(do_sample=True, max_length=5))
-
-            with self.assertRaises(ValueError):
-                # generating multiple sequences when no beam search generation
-                # is not allowed as it would always generate the same sequences
-                model.generate(input_ids, do_sample=False, num_return_sequences=2)
-
-            # num_return_sequences > 1, sample
-            self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2))
-
-            # check bad words tokens language generation
-            # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [
-                self._generate_random_bad_tokens(1, model),
-                self._generate_random_bad_tokens(2, model),
-            ]
-            output_tokens = model.generate(
-                input_ids,
-                do_sample=True,
-                bad_words_ids=bad_words_ids,
-                num_return_sequences=2,
-            )
-            # only count generated tokens
-            generated_ids = output_tokens[:, input_ids.shape[-1] :]
-            self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
+                # Flax models don't use the `use_cache` option, so we disable it here for the PyTorch model.
+                pt_model.config.use_cache = False
+
+                fx_model_class_name = "Flax" + model_class.__name__
+
+                if not hasattr(transformers, fx_model_class_name):
+                    # no flax model exists for this class
+                    return
+
+                fx_model_class = getattr(transformers, fx_model_class_name)
+
+                # load Flax class
+                fx_model = fx_model_class(config, dtype=jnp.float32)
+                # make sure only Flax inputs that actually exist in the function args are forwarded
+                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
+
+                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
+
+                # make sure weights are tied in PyTorch
+                pt_model.tie_weights()
+
+                # prepare inputs
+                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
+
+                # remove function args that don't exist in Flax
+                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
+
+                with torch.no_grad():
+                    pt_outputs = pt_model(**pt_inputs).to_tuple()
+
+                fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)}
+
+                fx_outputs = fx_model(**fx_inputs).to_tuple()
+                self.assertEqual(
+                    len(fx_outputs),
+                    len(pt_outputs),
+                    "Output lengths differ between Flax and PyTorch",
+                )
+
+                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
 
-    def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
+                with tempfile.TemporaryDirectory() as tmpdirname:
+                    fx_model.save_pretrained(tmpdirname)
+                    pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True)
+
+                with torch.no_grad():
+                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
+
+                self.assertEqual(
+                    len(fx_outputs),
+                    len(pt_outputs_loaded),
+                    "Output lengths differ between Flax and PyTorch",
+                )
+                for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
+                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
+
+    def test_inputs_embeds(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
-        if input_ids is None:
-            input_ids = inputs_dict.get("input_features", None)
 
-        # iterate over all generative models
-        for model_class in self.all_generative_model_classes:
+        for model_class in self.all_model_classes:
             model = model_class(config)
-            output_greedy = model.generate(
-                input_ids,
-                do_sample=False,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
-            output_sample = model.generate(
-                input_ids,
-                do_sample=True,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
+            model.to(torch_device)
+            model.eval()
 
-            if model.config.is_encoder_decoder:
-                self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput)
-                self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput)
+            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
+
+            if not self.is_encoder_decoder:
+                input_ids = inputs["input_ids"]
+                del inputs["input_ids"]
+            else:
+                encoder_input_ids = inputs["input_ids"]
+                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
+                del inputs["input_ids"]
+                inputs.pop("decoder_input_ids", None)
+
+            wte = model.get_input_embeddings()
+            if not self.is_encoder_decoder:
+                inputs["inputs_embeds"] = wte(input_ids)
             else:
-                self.assertIsInstance(output_greedy, TFGreedySearchDecoderOnlyOutput)
-                self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
+                inputs["inputs_embeds"] = wte(encoder_input_ids)
+                inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
 
-    def test_lm_head_model_random_beam_search_generate(self):
+            with torch.no_grad():
+                model(**inputs)[0]
+
+    @require_torch_multi_gpu
+    def test_multi_gpu_data_parallel_forward(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
 
-        for model_class in self.all_generative_model_classes:
+        # some params shouldn't be scattered by nn.DataParallel
+        # so just remove them if they are present.
+        blacklist_non_batched_params = [
+            "head_mask",
+            "decoder_head_mask",
+            "cross_attn_head_mask",
+        ]
+        for k in blacklist_non_batched_params:
+            inputs_dict.pop(k, None)
+
+        # move input tensors to cuda:0
+        for k, v in inputs_dict.items():
+            if torch.is_tensor(v):
+                inputs_dict[k] = v.to(0)
+
+        for model_class in self.all_model_classes:
+            model = model_class(config=config)
+            model.to(0)
+            model.eval()
+
+            # Wrap model in nn.DataParallel
+            model = nn.DataParallel(model)
+            with torch.no_grad():
+                _ = model(**self._prepare_for_class(inputs_dict, model_class))
+
+    @require_torch_multi_gpu
+    def test_model_parallelization(self):
+        if not self.test_model_parallel:
+            return
+
+        # a candidate for testing_utils
+        def get_current_gpu_memory_use():
+            """returns a list of cuda memory allocations per GPU in MBs"""
+
+            per_device_memory = []
+            for id in range(torch.cuda.device_count()):
+                with torch.cuda.device(id):
+                    per_device_memory.append(torch.cuda.memory_allocated() >> 20)
+
+            return per_device_memory
+
+        # Needs a large model to see the difference.
+        config = self.model_tester.get_large_model_config()
+
+        for model_class in self.all_parallelizable_model_classes:
+            torch.cuda.empty_cache()
+
+            # 1. single gpu memory load + unload + memory measurements
+            # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests)
+            memory_at_start = get_current_gpu_memory_use()
+
+            # Put model on device 0 and take a memory snapshot
             model = model_class(config)
+            model.to("cuda:0")
+            memory_after_model_load = get_current_gpu_memory_use()
 
-            if config.bos_token_id is None:
-                # if bos token id is not defined model needs input_ids, num_return_sequences = 1
-                self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2))
-            else:
-                # num_return_sequences = 1
-                self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2))
-
-            with self.assertRaises(AssertionError):
-                # generating more sequences than having beams leads is not possible
-                model.generate(
-                    input_ids,
-                    do_sample=False,
-                    num_return_sequences=3,
-                    num_beams=2,
-                )
+            # The memory use on device 0 should be higher than it was initially.
+            self.assertGreater(memory_after_model_load[0], memory_at_start[0])
 
-            # num_return_sequences > 1, sample
-            self._check_generated_ids(
-                model.generate(
-                    input_ids,
-                    do_sample=True,
-                    num_beams=2,
-                    num_return_sequences=2,
-                )
-            )
-            # num_return_sequences > 1, greedy
-            self._check_generated_ids(
-                model.generate(
-                    input_ids,
-                    do_sample=False,
-                    num_beams=2,
-                    num_return_sequences=2,
-                )
-            )
+            del model
+            gc.collect()
+            torch.cuda.empty_cache()
 
-            # check bad words tokens language generation
-            # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [
-                self._generate_random_bad_tokens(1, model),
-                self._generate_random_bad_tokens(2, model),
-            ]
-            output_tokens = model.generate(
-                input_ids,
-                do_sample=False,
-                bad_words_ids=bad_words_ids,
-                num_beams=2,
-                num_return_sequences=2,
-            )
-            # only count generated tokens
-            generated_ids = output_tokens[:, input_ids.shape[-1] :]
-            self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
+            # 2. MP test
+            # it's essential to re-calibrate the usage before the next stage
+            memory_at_start = get_current_gpu_memory_use()
+
+            # Spread model layers over multiple devices
+            model = model_class(config)
+            model.parallelize()
+            memory_after_parallelization = get_current_gpu_memory_use()
+
+            # Assert that the memory use on all devices is higher than it was when loaded only on CPU
+            for n in range(torch.cuda.device_count()):
+                self.assertGreater(memory_after_parallelization[n], memory_at_start[n])
+
+            # Assert that the memory use of device 0 is lower than it was when the entire model was loaded on it
+            self.assertLess(memory_after_parallelization[0], memory_after_model_load[0])
+
+            # Assert that the memory use of device 1 is higher than it was when the entire model was loaded
+            # on device 0 and device 1 wasn't used at all
+            self.assertGreater(memory_after_parallelization[1], memory_after_model_load[1])
+
+            del model
+            gc.collect()
+            torch.cuda.empty_cache()
+
+    @require_torch_multi_gpu
+    def test_model_parallel_equal_results(self):
+        if not self.test_model_parallel:
+            return
 
-    def test_lm_head_model_beam_search_generate_dict_outputs(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
-        if input_ids is None:
-            input_ids = inputs_dict.get("input_features", None)
 
-        # iterate over all generative models
-        for model_class in self.all_generative_model_classes:
+        for model_class in self.all_parallelizable_model_classes:
+            inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+
+            def cast_to_device(dictionary, device):
+                output = {}
+                for k, v in dictionary.items():
+                    if isinstance(v, torch.Tensor):
+                        output[k] = v.to(device)
+                    else:
+                        output[k] = v
+
+                return output
+
             model = model_class(config)
-            output_beam_search = model.generate(
-                input_ids,
-                num_beams=2,
-                do_sample=False,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
-            output_beam_sample = model.generate(
-                input_ids,
-                num_beams=2,
-                do_sample=True,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
+            output = model(**cast_to_device(inputs_dict, "cpu"))
 
-            if model.config.is_encoder_decoder:
-                self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput)
-                self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput)
-            else:
-                self.assertIsInstance(output_beam_search, TFBeamSearchDecoderOnlyOutput)
-                self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
+            model.parallelize()
+
+            parallel_output = model(**cast_to_device(inputs_dict, "cuda:0"))
+
+            for value, parallel_value in zip(output, parallel_output):
+                if isinstance(value, torch.Tensor):
+                    self.assertTrue(torch.allclose(value, parallel_value.to("cpu"), atol=1e-7))
+                elif isinstance(value, (Tuple, List)):
+                    for value_, parallel_value_ in zip(value, parallel_value):
+                        self.assertTrue(torch.allclose(value_, parallel_value_.to("cpu"), atol=1e-7))
+
+    @require_torch_multi_gpu
+    def test_model_parallel_beam_search(self):
+        if not self.test_model_parallel:
+            return
+
+        all_generative_and_parallelizable_model_classes = tuple(
+            set(self.all_generative_model_classes).intersection(self.all_parallelizable_model_classes)
+        )
 
-    def test_loss_computation(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
+
+        for model_class in all_generative_and_parallelizable_model_classes:
+            inputs_dict = self._prepare_for_class(inputs_dict, model_class)
             model = model_class(config)
-            if getattr(model, "hf_compute_loss", None):
-                # The number of elements in the loss should be the same as the number of elements in the label
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                added_label = prepared_for_class[
-                    sorted(
-                        list(prepared_for_class.keys() - inputs_dict.keys()),
-                        reverse=True,
-                    )[0]
-                ]
-                loss_size = tf.size(added_label)
 
-                if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING):
-                    # if loss is causal lm loss, labels are shift, so that one label per batch
-                    # is cut
-                    loss_size = loss_size - self.model_tester.batch_size
+            def cast_to_device(dictionary, device):
+                output = {}
+                for k, v in dictionary.items():
+                    if isinstance(v, torch.Tensor):
+                        output[k] = v.to(device)
+                    else:
+                        output[k] = v
 
-                # Test that model correctly compute the loss with kwargs
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                possible_input_names = {
-                    "input_ids",
-                    "pixel_values",
-                    "input_features",
-                }
-                input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
-                model_input = prepared_for_class.pop(input_name)
-
-                loss = model(model_input, **prepared_for_class)[0]
-                self.assertEqual(loss.shape, [loss_size])
-
-                # Test that model correctly compute the loss with a dict
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                loss = model(prepared_for_class)[0]
-                self.assertEqual(loss.shape, [loss_size])
-
-                # Test that model correctly compute the loss with a tuple
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-
-                # Get keys that were added with the _prepare_for_class function
-                label_keys = prepared_for_class.keys() - inputs_dict.keys()
-                signature = inspect.signature(model.call).parameters
-                signature_names = list(signature.keys())
-
-                # Create a dictionary holding the location of the tensors in the tuple
-                tuple_index_mapping = {0: input_name}
-                for label_key in label_keys:
-                    label_key_index = signature_names.index(label_key)
-                    tuple_index_mapping[label_key_index] = label_key
-                sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
-                # Initialize a list with their default values, update the values and convert to a tuple
-                list_input = []
-
-                for name in signature_names:
-                    if name != "kwargs":
-                        list_input.append(signature[name].default)
-
-                for index, value in sorted_tuple_index_mapping:
-                    list_input[index] = prepared_for_class[value]
-
-                tuple_input = tuple(list_input)
-
-                # Send to model
-                loss = model(tuple_input[:-1])[0]
-
-                self.assertEqual(loss.shape, [loss_size])
-
-    def test_generate_with_headmasking(self):
-        attention_names = [
-            "encoder_attentions",
-            "decoder_attentions",
-            "cross_attentions",
-        ]
+                return output
+
+            model.parallelize()
+            model.generate(**cast_to_device(inputs_dict, "cuda:0"), num_beams=2)
+
+    def test_problem_types(self):
         (
             config,
             inputs_dict,
         ) = self.model_tester.prepare_config_and_inputs_for_common()
 
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
+        problem_types = [
+            {
+                "title": "multi_label_classification",
+                "num_labels": 2,
+                "dtype": torch.float,
+            },
+            {
+                "title": "single_label_classification",
+                "num_labels": 1,
+                "dtype": torch.long,
+            },
+            {"title": "regression", "num_labels": 1, "dtype": torch.float},
+        ]
 
-            # We want to test only encoder-decoder models
-            if not config.is_encoder_decoder:
+        for model_class in self.all_model_classes:
+            if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
                 continue
 
-            head_masking = {
-                "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)),
-                "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
-                "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
-            }
+            for problem_type in problem_types:
+                with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"):
 
-            signature = inspect.signature(model.call)
-            if set(head_masking.keys()) < set([*signature.parameters.keys()]):
-                continue
+                    config.problem_type = problem_type["title"]
+                    config.num_labels = problem_type["num_labels"]
 
-            for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
-                out = model.generate(
-                    inputs_dict["input_ids"],
-                    num_beams=1,
-                    max_length=inputs_dict["input_ids"] + 5,
-                    output_attentions=True,
-                    return_dict_in_generate=True,
-                    **{name: mask},
-                )
-                # We check the state of decoder_attentions and cross_attentions just from the last step
-                attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
-                self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0)
+                    model = model_class(config)
+                    model.to(torch_device)
+                    model.train()
+
+                    inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+
+                    if problem_type["num_labels"] > 1:
+                        inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"])
+
+                    inputs["labels"] = inputs["labels"].to(problem_type["dtype"])
+
+                    # This tests that we do not trigger the warning from PyTorch "Using a target size that is different
+                    # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure
+                    # they have the same size." which is a symptom something is wrong for the regression problem.
+                    # See https://github.com/huggingface/transformers/issues/11780
+                    with warnings.catch_warnings(record=True) as warning_list:
+                        loss = model(**inputs).loss
+                    for w in warning_list:
+                        if "Using a target size that is different to the input size" in str(w.message):
+                            raise ValueError(
+                                f"Something is going wrong in the regression problem: intercepted {w.message}"
+                            )
+
+                    loss.backward()
 
     def test_load_with_mismatched_shapes(self):
         if not self.test_mismatched_shapes:
@@ -1387,93 +2166,54 @@ def test_load_with_mismatched_shapes(self):
         ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
-            if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
+            if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
                 continue
 
             with self.subTest(msg=f"Testing {model_class}"):
                 with tempfile.TemporaryDirectory() as tmp_dir:
                     model = model_class(config)
-                    inputs = self._prepare_for_class(inputs_dict, model_class)
-                    _ = model(**inputs)
                     model.save_pretrained(tmp_dir)
 
                     # Fails when we don't set ignore_mismatched_sizes=True
-                    with self.assertRaises(ValueError):
-                        new_model = TFAutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42)
-                    with self.assertRaises(ValueError):
-                        new_model_without_prefix = TFAutoModel.from_pretrained(tmp_dir, vocab_size=10)
+                    with self.assertRaises(RuntimeError):
+                        new_model = AutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42)
+                    with self.assertRaises(RuntimeError):
+                        new_model_without_prefix = AutoModel.from_pretrained(tmp_dir, vocab_size=10)
+
+                    logger = logging.get_logger("transformers.modeling_utils")
 
-                    logger = logging.get_logger("transformers.modeling_tf_utils")
                     with CaptureLogger(logger) as cl:
-                        new_model = TFAutoModelForSequenceClassification.from_pretrained(
-                            tmp_dir, num_labels=42, ignore_mismatched_sizes=True
+                        new_model = AutoModelForSequenceClassification.from_pretrained(
+                            tmp_dir,
+                            num_labels=42,
+                            ignore_mismatched_sizes=True,
                         )
                     self.assertIn("the shapes did not match", cl.out)
-
+                    new_model.to(torch_device)
+                    inputs = self._prepare_for_class(inputs_dict, model_class)
                     logits = new_model(**inputs).logits
                     self.assertEqual(logits.shape[1], 42)
 
                     with CaptureLogger(logger) as cl:
-                        new_model_without_prefix = TFAutoModel.from_pretrained(
+                        new_model_without_prefix = AutoModel.from_pretrained(
                             tmp_dir, vocab_size=10, ignore_mismatched_sizes=True
                         )
                     self.assertIn("the shapes did not match", cl.out)
-
-                    # Although Tf models always have a prefix pointing to `MainLayer`,
-                    # we still add this "without prefix" test to keep a consistency between tf and pt tests.
                     input_ids = ids_tensor((2, 8), 10)
+                    new_model_without_prefix.to(torch_device)
                     if self.is_encoder_decoder:
                         new_model_without_prefix(input_ids, decoder_input_ids=input_ids)
                     else:
                         new_model_without_prefix(input_ids)
 
-    def test_model_main_input_name(self):
-        for model_class in self.all_model_classes:
-            model_signature = inspect.signature(getattr(model_class, "call"))
-            # The main input is the name of the argument after `self`
-            observed_main_input_name = list(model_signature.parameters.keys())[1]
-            self.assertEqual(model_class.main_input_name, observed_main_input_name)
 
-    def _generate_random_bad_tokens(self, num_bad_tokens, model):
-        # special tokens cannot be bad tokens
-        special_tokens = []
-        if model.config.bos_token_id is not None:
-            special_tokens.append(model.config.bos_token_id)
-        if model.config.pad_token_id is not None:
-            special_tokens.append(model.config.pad_token_id)
-        if model.config.eos_token_id is not None:
-            special_tokens.append(model.config.eos_token_id)
-
-        # create random bad tokens that are not special tokens
-        bad_tokens = []
-        while len(bad_tokens) < num_bad_tokens:
-            token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0]
-            if token not in special_tokens:
-                bad_tokens.append(token)
-        return bad_tokens
-
-    def _check_generated_ids(self, output_ids):
-        for token_id in output_ids[0].numpy().tolist():
-            self.assertGreaterEqual(token_id, 0)
-            self.assertLess(token_id, self.model_tester.vocab_size)
-
-    def _check_match_tokens(self, generated_ids, bad_words_ids):
-        # for all bad word tokens
-        for bad_word_ids in bad_words_ids:
-            # for all slices in batch
-            for generated_ids_slice in generated_ids:
-                # for all word idx
-                for i in range(len(bad_word_ids), len(generated_ids_slice)):
-                    # if tokens match
-                    if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids:
-                        return True
-        return False
-
-
-def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
-    """Creates a random int32 tensor of the shape within the vocab size."""
+global_rng = random.Random()
+
+
+def ids_tensor(shape, vocab_size, rng=None, name=None):
+    #  Creates a random int64 (torch.long) tensor of the given shape with values in [0, vocab_size - 1]
     if rng is None:
-        rng = random.Random()
+        rng = global_rng
 
     total_dims = 1
     for dim in shape:
@@ -1483,28 +2223,20 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
     for _ in range(total_dims):
         values.append(rng.randint(0, vocab_size - 1))
 
-    output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)
+    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
 
-    return output
 
-
-def random_attention_mask(shape, rng=None, name=None, dtype=None):
-    attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype)
+def random_attention_mask(shape, rng=None, name=None):
+    attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None)
     # make sure that at least one token is attended to for each batch
-    attn_mask = tf.concat(
-        [
-            tf.constant(value=1, shape=(shape[0], 1), dtype=dtype),
-            attn_mask[:, 1:],
-        ],
-        axis=1,
-    )
+    attn_mask[:, -1] = 1
     return attn_mask
 
 
-def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
+def floats_tensor(shape, scale=1.0, rng=None, name=None):
     """Creates a random float32 tensor"""
     if rng is None:
-        rng = random.Random()
+        rng = global_rng
 
     total_dims = 1
     for dim in shape:
@@ -1514,134 +2246,128 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
     for _ in range(total_dims):
         values.append(rng.random() * scale)
 
-    return tf.reshape(
-        tf.constant(values, dtype=dtype if dtype is not None else tf.float32),
-        shape=shape,
-    )
+    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
 
 
-@require_tf
-class UtilsFunctionsTest(unittest.TestCase):
-
-    # tests whether the top_k_top_p_filtering function behaves as expected
-    def test_top_k_top_p_filtering(self):
-        logits = tf.convert_to_tensor(
-            [
-                [
-                    8.2220991,  # 3rd highest value; idx. 0
-                    -0.5620044,
-                    5.23229752,
-                    4.0386393,
-                    -6.8798378,
-                    -0.54785802,
-                    -3.2012153,
-                    2.92777176,
-                    1.88171953,
-                    7.35341276,  # 5th highest value; idx. 9
-                    8.43207833,  # 2nd highest value; idx. 10
-                    -9.85711836,
-                    -5.96209236,
-                    -1.13039161,
-                    -7.1115294,
-                    -0.8369633,
-                    -5.3186408,
-                    7.06427407,
-                    0.81369344,
-                    -0.82023817,
-                    -5.9179796,
-                    0.58813443,
-                    -6.99778438,
-                    4.71551189,
-                    -0.18771637,
-                    7.44020759,  # 4th highest value; idx. 25
-                    9.38450987,  # 1st highest value; idx. 26
-                    2.12662941,
-                    -9.32562038,
-                    2.35652522,
-                ],  # cummulative prob of 5 highest values <= 0.6
-                [
-                    0.58425518,
-                    4.53139238,
-                    -5.57510464,
-                    -6.28030699,
-                    -7.19529503,
-                    -4.02122551,
-                    1.39337037,
-                    -6.06707057,
-                    1.59480517,
-                    -9.643119,
-                    0.03907799,
-                    0.67231762,
-                    -8.88206726,
-                    6.27115922,  # 4th highest value; idx. 13
-                    2.28520723,
-                    4.82767506,
-                    4.30421368,
-                    8.8275313,  # 2nd highest value; idx. 17
-                    5.44029958,  # 5th highest value; idx. 18
-                    -4.4735794,
-                    7.38579536,  # 3rd highest value; idx. 20
-                    -2.91051663,
-                    2.61946077,
-                    -2.5674762,
-                    -9.48959302,
-                    -4.02922645,
-                    -1.35416918,
-                    9.67702323,  # 1st highest value; idx. 27
-                    -5.89478553,
-                    1.85370467,
-                ],  # cummulative prob of 5 highest values <= 0.6
-            ],
-            dtype=tf.float32,
-        )
+@require_torch
+class ModelUtilsTest(TestCasePlus):
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            config = BertConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, PretrainedConfig)
 
-        non_inf_expected_idx = tf.convert_to_tensor(
-            [
-                [0, 0],
-                [0, 9],
-                [0, 10],
-                [0, 25],
-                [0, 26],
-                [1, 13],
-                [1, 17],
-                [1, 18],
-                [1, 20],
-                [1, 27],
-            ],
-            dtype=tf.int32,
-        )  # expected non filtered idx as noted above
-
-        non_inf_expected_output = tf.convert_to_tensor(
-            [
-                8.222099,
-                7.3534126,
-                8.432078,
-                7.4402075,
-                9.38451,
-                6.271159,
-                8.827531,
-                5.4402995,
-                7.3857956,
-                9.677023,
-            ],
-            dtype=tf.float32,
-        )  # expected non filtered values as noted above
-
-        output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4)
-
-        non_inf_output = output[output != -float("inf")]
-        non_inf_idx = tf.cast(
-            tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))),
-            dtype=tf.int32,
-        )
+            model = BertModel.from_pretrained(model_name)
+            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, PreTrainedModel)
 
-        tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12)
-        tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx)
+            self.assertEqual(len(loading_info["missing_keys"]), 0)
+            self.assertEqual(len(loading_info["unexpected_keys"]), 8)
+            self.assertEqual(len(loading_info["mismatched_keys"]), 0)
+            self.assertEqual(len(loading_info["error_msgs"]), 0)
 
+            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
 
-@require_tf
+            # Not sure this is the intended behavior. TODO fix Lysandre & Thom
+            config.name_or_path = model_name
+
+            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            self.assertEqual(model.config.output_hidden_states, True)
+            self.assertEqual(model.config, config)
+
+    def test_model_from_pretrained_with_different_pretrained_model_name(self):
+        model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
+        self.assertIsNotNone(model)
+
+        logger = logging.get_logger("transformers.configuration_utils")
+        with CaptureLogger(logger) as cl:
+            BertModel.from_pretrained(TINY_T5)
+        self.assertTrue("You are using a model of type t5 to instantiate a model of type bert" in cl.out)
+
+    @require_torch
+    def test_model_from_config_torch_dtype(self):
+        # test that the model can be instantiated with dtype of user's choice - as long as it's a
+        # float dtype. To make it happen config.torch_dtype needs to be set before instantiating the
+        # model from the config object.
+
+        config = T5Config.from_pretrained(TINY_T5)
+        model = AutoModel.from_config(config)
+        # XXX: isn't supported
+        # model = T5ForConditionalGeneration.from_config(config)
+        self.assertEqual(model.dtype, torch.float32)
+
+        model = AutoModel.from_config(config, torch_dtype=torch.float16)
+        self.assertEqual(model.dtype, torch.float16)
+
+        # torch.set_default_dtype() supports only float dtypes, so will fail with non-float type
+        with self.assertRaises(ValueError):
+            model = AutoModel.from_config(config, torch_dtype=torch.int64)
+
+    @require_torch
+    def test_model_from_pretrained_torch_dtype(self):
+        # test that the model can be instantiated with dtype of either
+        # 1. explicit from_pretrained's torch_dtype argument
+        # 2. via autodiscovery by looking at model weights (torch_dtype="auto")
+        # so if a model.half() was saved, we want it to be instantiated as such.
+        #
+        # test an explicit model class, but also AutoModel separately as the latter goes through a different code path
+        model_path = self.get_auto_remove_tmp_dir()
+
+        # baseline - we know TINY_T5 is fp32 model
+        model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
+        self.assertEqual(model.dtype, torch.float32)
+
+        # test the default fp32 save_pretrained => from_pretrained cycle
+        model.save_pretrained(model_path)
+        model = T5ForConditionalGeneration.from_pretrained(model_path)
+        self.assertEqual(model.dtype, torch.float32)
+        # test with auto-detection
+        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
+        self.assertEqual(model.dtype, torch.float32)
+
+        # test forced loading in fp16 (even though the weights are in fp32)
+        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
+        self.assertEqual(model.dtype, torch.float16)
+
+        # test fp16 save_pretrained, loaded with auto-detection
+        model = model.half()
+        model.save_pretrained(model_path)
+        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
+        self.assertEqual(model.config.torch_dtype, torch.float16)
+        self.assertEqual(model.dtype, torch.float16)
+
+        # tests `config.torch_dtype` saving
+        with open(f"{model_path}/config.json") as f:
+            config_dict = json.load(f)
+        self.assertEqual(config_dict["torch_dtype"], "float16")
+
+        # test fp16 save_pretrained, loaded with the explicit fp16
+        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
+        self.assertEqual(model.dtype, torch.float16)
+
+        # test AutoModel separately as it goes through a different path
+        # test auto-detection
+        model = AutoModel.from_pretrained(TINY_T5, torch_dtype="auto")
+        self.assertEqual(model.dtype, torch.float32)
+        # test forcing an explicit dtype
+        model = AutoModel.from_pretrained(TINY_T5, torch_dtype=torch.float16)
+        self.assertEqual(model.dtype, torch.float16)
+
+    def test_no_super_init_config_and_model(self):
+        config = NoSuperInitConfig(attribute=32)
+        model = NoSuperInitModel(config)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir)
+
+            model = NoSuperInitModel.from_pretrained(tmp_dir)
+
+
+@require_torch
 @is_staging_test
-class TFModelPushToHubTester(unittest.TestCase):
+class ModelPushToHubTester(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls._token = login(username=USER, password=PASS)
@@ -1649,19 +2375,29 @@ def setUpClass(cls):
     @classmethod
     def tearDownClass(cls):
         try:
-            delete_repo(token=cls._token, name="test-model-tf")
+            delete_repo(token=cls._token, name="test-model")
         except HTTPError:
             pass
 
         try:
             delete_repo(
                 token=cls._token,
-                name="test-model-tf-org",
+                name="test-model-org",
                 organization="valid_org",
             )
         except HTTPError:
             pass
 
+        try:
+            delete_repo(token=cls._token, name="test-dynamic-model")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, name="test-dynamic-model-config")
+        except HTTPError:
+            pass
+
     def test_push_to_hub(self):
         config = BertConfig(
             vocab_size=99,
@@ -1670,35 +2406,17 @@ def test_push_to_hub(self):
             num_attention_heads=4,
             intermediate_size=37,
         )
-        model = TFBertModel(config)
-        # Make sure model is properly initialized
-        _ = model(model.dummy_inputs)
+        model = BertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
             model.save_pretrained(
-                os.path.join(tmp_dir, "test-model-tf"),
+                os.path.join(tmp_dir, "test-model"),
                 push_to_hub=True,
                 use_auth_token=self._token,
             )
 
-            new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
-            models_equal = True
-            for p1, p2 in zip(model.weights, new_model.weights):
-                if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                    models_equal = False
-            self.assertTrue(models_equal)
-
-    def test_push_to_hub_with_model_card(self):
-        config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-        )
-        model = TFBertModel(config)
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.push_to_hub(os.path.join(tmp_dir, "test-model-tf"))
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "test-model-card-tf", "README.md")))
+            new_model = BertModel.from_pretrained(f"{USER}/test-model")
+            for p1, p2 in zip(model.parameters(), new_model.parameters()):
+                self.assertTrue(torch.equal(p1, p2))
 
     def test_push_to_hub_in_organization(self):
         config = BertConfig(
@@ -1708,18 +2426,50 @@ def test_push_to_hub_in_organization(self):
             num_attention_heads=4,
             intermediate_size=37,
         )
-        model = TFBertModel(config)
+        model = BertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
             model.save_pretrained(
-                os.path.join(tmp_dir, "test-model-tf-org"),
+                os.path.join(tmp_dir, "test-model-org"),
                 push_to_hub=True,
                 use_auth_token=self._token,
                 organization="valid_org",
             )
 
-            new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org")
-            models_equal = True
-            for p1, p2 in zip(model.weights, new_model.weights):
-                if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                    models_equal = False
-            self.assertTrue(models_equal)
+            new_model = BertModel.from_pretrained("valid_org/test-model-org")
+            for p1, p2 in zip(model.parameters(), new_model.parameters()):
+                self.assertTrue(torch.equal(p1, p2))
+
+    def test_push_to_hub_dynamic_model(self):
+        CustomConfig.register_for_auto_class()
+        CustomModel.register_for_auto_class()
+
+        config = CustomConfig(hidden_size=32)
+        model = CustomModel(config)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            repo = Repository(
+                tmp_dir,
+                clone_from=f"{USER}/test-dynamic-model",
+                use_auth_token=self._token,
+            )
+            model.save_pretrained(tmp_dir)
+            # checks
+            self.assertDictEqual(
+                config.auto_map,
+                {
+                    "AutoConfig": "custom_configuration.CustomConfig",
+                    "AutoModel": "custom_modeling.CustomModel",
+                },
+            )
+
+            repo.push_to_hub()
+
+        new_model = AutoModel.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True)
+        # Can't make an isinstance check because the new_model is from the CustomModel class of a dynamic module
+        self.assertEqual(new_model.__class__.__name__, "CustomModel")
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        config = AutoConfig.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True)
+        new_model = AutoModel.from_config(config, trust_remote_code=True)
+        self.assertEqual(new_model.__class__.__name__, "CustomModel")

From 4dea175013e59ff1838185f06aa6e60c95a3fd66 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 24 Feb 2022 15:45:46 +0530
Subject: [PATCH 57/65] chore: revert to the original test_modeling_common.py

---
 tests/test_modeling_common.py | 452 +++++++---------------------------
 1 file changed, 86 insertions(+), 366 deletions(-)

diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 348ffcd2c4490..17888bcfac380 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -140,21 +140,13 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
 
         if return_labels:
             if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs_dict["labels"] = torch.ones(
-                    self.model_tester.batch_size,
-                    dtype=torch.long,
-                    device=torch_device,
-                )
+                inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
             elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                 inputs_dict["start_positions"] = torch.zeros(
-                    self.model_tester.batch_size,
-                    dtype=torch.long,
-                    device=torch_device,
+                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
                 )
                 inputs_dict["end_positions"] = torch.zeros(
-                    self.model_tester.batch_size,
-                    dtype=torch.long,
-                    device=torch_device,
+                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
                 )
             elif model_class in [
                 *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
@@ -162,9 +154,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
                 *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
             ]:
                 inputs_dict["labels"] = torch.zeros(
-                    self.model_tester.batch_size,
-                    dtype=torch.long,
-                    device=torch_device,
+                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
                 )
             elif model_class in [
                 *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
@@ -174,27 +164,17 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
                 *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
             ]:
                 inputs_dict["labels"] = torch.zeros(
-                    (
-                        self.model_tester.batch_size,
-                        self.model_tester.seq_length,
-                    ),
-                    dtype=torch.long,
-                    device=torch_device,
+                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                 )
             elif model_class in get_values(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING):
                 num_patches = self.model_tester.image_size // self.model_tester.patch_size
                 inputs_dict["bool_masked_pos"] = torch.zeros(
-                    (self.model_tester.batch_size, num_patches ** 2),
-                    dtype=torch.long,
-                    device=torch_device,
+                    (self.model_tester.batch_size, num_patches**2), dtype=torch.long, device=torch_device
                 )
         return inputs_dict
 
     def test_save_load(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -220,10 +200,7 @@ def test_save_load(self):
                 self.assertLessEqual(max_diff, 1e-5)
 
     def test_save_load_keys_to_ignore_on_save(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -233,11 +210,7 @@ def test_save_load_keys_to_ignore_on_save(self):
 
             # check the keys are in the original state_dict
             for k in _keys_to_ignore_on_save:
-                self.assertIn(
-                    k,
-                    model.state_dict().keys(),
-                    "\n".join(model.state_dict().keys()),
-                )
+                self.assertIn(k, model.state_dict().keys(), "\n".join(model.state_dict().keys()))
 
             # check that certain keys didn't get saved with the model
             with tempfile.TemporaryDirectory() as tmpdirname:
@@ -245,11 +218,7 @@ def test_save_load_keys_to_ignore_on_save(self):
                 output_model_file = os.path.join(tmpdirname, WEIGHTS_NAME)
                 state_dict_saved = torch.load(output_model_file)
                 for k in _keys_to_ignore_on_save:
-                    self.assertNotIn(
-                        k,
-                        state_dict_saved.keys(),
-                        "\n".join(state_dict_saved.keys()),
-                    )
+                    self.assertNotIn(k, state_dict_saved.keys(), "\n".join(state_dict_saved.keys()))
 
                 # Test we can load the state dict in the model, necessary for the checkpointing API in Trainer.
                 load_result = model.load_state_dict(state_dict_saved, strict=False)
@@ -260,10 +229,7 @@ def test_save_load_keys_to_ignore_on_save(self):
                 self.assertTrue(len(load_result.unexpected_keys) == 0)
 
     def test_gradient_checkpointing_backward_compatibility(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             if not model_class.supports_gradient_checkpointing:
@@ -274,10 +240,7 @@ def test_gradient_checkpointing_backward_compatibility(self):
             self.assertTrue(model.is_gradient_checkpointing)
 
     def test_gradient_checkpointing_enable_disable(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             if not model_class.supports_gradient_checkpointing:
@@ -302,10 +265,7 @@ def _mock_init_weights(self, module):
             module.bias.data.fill_(3)
 
     def test_save_load_fast_init_from_base(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         base_class = MODEL_MAPPING[config.__class__]
 
         if isinstance(base_class, tuple):
@@ -350,10 +310,7 @@ class CopyClass(model_class):
                     self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
 
     def test_save_load_fast_init_to_base(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         base_class = MODEL_MAPPING[config.__class__]
 
         if isinstance(base_class, tuple):
@@ -399,10 +356,7 @@ class CopyClass(base_class):
                     self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
 
     def test_initialization(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         configs_no_init = _config_zero_init(config)
         for model_class in self.all_model_classes:
@@ -416,10 +370,7 @@ def test_initialization(self):
                     )
 
     def test_determinism(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -453,12 +404,7 @@ def test_forward_signature(self):
                     "decoder_attention_mask",
                 ]
                 expected_arg_names.extend(
-                    [
-                        "head_mask",
-                        "decoder_head_mask",
-                        "cross_attn_head_mask",
-                        "encoder_outputs",
-                    ]
+                    ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"]
                     if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names
                     else ["encoder_outputs"]
                 )
@@ -472,10 +418,7 @@ def test_training(self):
             return
 
         for model_class in self.all_model_classes:
-            (
-                config,
-                inputs_dict,
-            ) = self.model_tester.prepare_config_and_inputs_for_common()
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             config.return_dict = True
 
             if model_class in get_values(MODEL_MAPPING):
@@ -493,10 +436,7 @@ def test_training_gradient_checkpointing(self):
             return
 
         for model_class in self.all_model_classes:
-            (
-                config,
-                inputs_dict,
-            ) = self.model_tester.prepare_config_and_inputs_for_common()
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             config.use_cache = False
             config.return_dict = True
 
@@ -511,10 +451,7 @@ def test_training_gradient_checkpointing(self):
             loss.backward()
 
     def test_attention_outputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
 
         seq_len = getattr(self.model_tester, "seq_length", None)
@@ -552,21 +489,12 @@ def test_attention_outputs(self):
             if chunk_length is not None:
                 self.assertListEqual(
                     list(attentions[0].shape[-4:]),
-                    [
-                        self.model_tester.num_attention_heads,
-                        encoder_seq_length,
-                        chunk_length,
-                        encoder_key_length,
-                    ],
+                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
                 )
             else:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
-                    [
-                        self.model_tester.num_attention_heads,
-                        encoder_seq_length,
-                        encoder_key_length,
-                    ],
+                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                 )
             out_len = len(outputs)
 
@@ -590,11 +518,7 @@ def test_attention_outputs(self):
                 self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
                     list(decoder_attentions[0].shape[-3:]),
-                    [
-                        self.model_tester.num_attention_heads,
-                        decoder_seq_length,
-                        decoder_key_length,
-                    ],
+                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
                 )
 
                 # cross attentions
@@ -633,46 +557,28 @@ def test_attention_outputs(self):
             if chunk_length is not None:
                 self.assertListEqual(
                     list(self_attentions[0].shape[-4:]),
-                    [
-                        self.model_tester.num_attention_heads,
-                        encoder_seq_length,
-                        chunk_length,
-                        encoder_key_length,
-                    ],
+                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
                 )
             else:
                 self.assertListEqual(
                     list(self_attentions[0].shape[-3:]),
-                    [
-                        self.model_tester.num_attention_heads,
-                        encoder_seq_length,
-                        encoder_key_length,
-                    ],
+                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                 )
 
     @slow
     def test_torchscript(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         self._create_and_check_torchscript(config, inputs_dict)
 
     @slow
     def test_torchscript_output_attentions(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.output_attentions = True
         self._create_and_check_torchscript(config, inputs_dict)
 
     @slow
     def test_torchscript_output_hidden_state(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.output_hidden_states = True
         self._create_and_check_torchscript(config, inputs_dict)
 
@@ -696,13 +602,7 @@ def _create_and_check_torchscript(self, config, inputs_dict):
                     decoder_input_ids = inputs["decoder_input_ids"]
                     decoder_attention_mask = inputs["decoder_attention_mask"]
                     traced_model = torch.jit.trace(
-                        model,
-                        (
-                            input_ids,
-                            attention_mask,
-                            decoder_input_ids,
-                            decoder_attention_mask,
-                        ),
+                        model, (input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)
                     )
                 else:
                     input_ids = inputs["input_ids"]
@@ -741,10 +641,7 @@ def _create_and_check_torchscript(self, config, inputs_dict):
                 key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
             }
 
-            self.assertEqual(
-                set(model_state_dict.keys()),
-                set(loaded_model_state_dict.keys()),
-            )
+            self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
 
             model_buffers = list(model.buffers())
             for non_persistent_buffer in non_persistent_buffers.values():
@@ -767,17 +664,11 @@ def _create_and_check_torchscript(self, config, inputs_dict):
             self.assertTrue(models_equal)
 
     def test_torch_fx(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         self._create_and_check_torch_fx_tracing(config, inputs_dict)
 
     def test_torch_fx_output_loss(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         self._create_and_check_torch_fx_tracing(config, inputs_dict, output_loss=True)
 
     def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
@@ -797,12 +688,7 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa
                 if model.config.is_encoder_decoder:
                     model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
                     labels = inputs.get("labels", None)
-                    input_names = [
-                        "input_ids",
-                        "attention_mask",
-                        "decoder_input_ids",
-                        "decoder_attention_mask",
-                    ]
+                    input_names = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"]
                     if labels is not None:
                         input_names.append("labels")
                     filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
@@ -812,11 +698,7 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa
                     traced_model = symbolic_trace(model, input_names)
                     traced_output = traced_model(**filtered_inputs)
                 else:
-                    input_names = [
-                        "input_ids",
-                        "attention_mask",
-                        "token_type_ids",
-                    ]
+                    input_names = ["input_ids", "attention_mask", "token_type_ids"]
                     input_ids = inputs["input_ids"]
 
                     labels = inputs.get("labels", None)
@@ -872,10 +754,7 @@ def test_headmasking(self):
             return
 
         global_rng.seed(42)
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         global_rng.seed()
 
         inputs_dict["output_attentions"] = True
@@ -970,10 +849,7 @@ def test_head_pruning(self):
 
             self.assertEqual(attentions[0].shape[-3], 1)
             self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(
-                attentions[-1].shape[-3],
-                self.model_tester.num_attention_heads - 1,
-            )
+            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
 
     def test_head_pruning_save_load_from_pretrained(self):
         if not self.test_pruning:
@@ -1009,10 +885,7 @@ def test_head_pruning_save_load_from_pretrained(self):
             attentions = outputs[-1]
             self.assertEqual(attentions[0].shape[-3], 1)
             self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(
-                attentions[-1].shape[-3],
-                self.model_tester.num_attention_heads - 1,
-            )
+            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
 
     def test_head_pruning_save_load_from_config_init(self):
         if not self.test_pruning:
@@ -1046,10 +919,7 @@ def test_head_pruning_save_load_from_config_init(self):
 
             self.assertEqual(attentions[0].shape[-3], 1)
             self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(
-                attentions[-1].shape[-3],
-                self.model_tester.num_attention_heads - 1,
-            )
+            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
 
     def test_head_pruning_integration(self):
         if not self.test_pruning:
@@ -1078,14 +948,8 @@ def test_head_pruning_integration(self):
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
             attentions = outputs[-1]
 
-            self.assertEqual(
-                attentions[0].shape[-3],
-                self.model_tester.num_attention_heads - 1,
-            )
-            self.assertEqual(
-                attentions[1].shape[-3],
-                self.model_tester.num_attention_heads - 2,
-            )
+            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
             self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
             self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
 
@@ -1098,14 +962,8 @@ def test_head_pruning_integration(self):
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
             attentions = outputs[-1]
 
-            self.assertEqual(
-                attentions[0].shape[-3],
-                self.model_tester.num_attention_heads - 1,
-            )
-            self.assertEqual(
-                attentions[1].shape[-3],
-                self.model_tester.num_attention_heads - 2,
-            )
+            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
             self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
             self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
 
@@ -1116,18 +974,9 @@ def test_head_pruning_integration(self):
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
             attentions = outputs[-1]
 
-            self.assertEqual(
-                attentions[0].shape[-3],
-                self.model_tester.num_attention_heads - 1,
-            )
-            self.assertEqual(
-                attentions[1].shape[-3],
-                self.model_tester.num_attention_heads - 2,
-            )
-            self.assertEqual(
-                attentions[2].shape[-3],
-                self.model_tester.num_attention_heads - 2,
-            )
+            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
+            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
             self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
 
             self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
@@ -1144,9 +993,7 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
 
             expected_num_layers = getattr(
-                self.model_tester,
-                "expected_num_hidden_layers",
-                self.model_tester.num_hidden_layers + 1,
+                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
             )
             self.assertEqual(len(hidden_states), expected_num_layers)
 
@@ -1175,10 +1022,7 @@ def check_hidden_states_output(inputs_dict, config, model_class):
                     [decoder_seq_length, self.model_tester.hidden_size],
                 )
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             inputs_dict["output_hidden_states"] = True
@@ -1191,10 +1035,7 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             check_hidden_states_output(inputs_dict, config, model_class)
 
     def test_retain_grad_hidden_states_attentions(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.output_hidden_states = True
         config.output_attentions = True
 
@@ -1288,10 +1129,7 @@ def test_resize_position_vector_embeddings(self):
 
             # Retrieve the embeddings and clone theme
             if model.config.is_encoder_decoder:
-                (
-                    encoder_model_embed,
-                    decoder_model_embed,
-                ) = model.get_position_embeddings()
+                encoder_model_embed, decoder_model_embed = model.get_position_embeddings()
                 encoder_cloned_embeddings = encoder_model_embed.weight.clone()
                 decoder_cloned_embeddings = decoder_model_embed.weight.clone()
             else:
@@ -1301,25 +1139,13 @@ def test_resize_position_vector_embeddings(self):
             # Check that resizing the position embeddings with a larger max_position_embeddings increases
             # the model's postion embeddings size
             model.resize_position_embeddings(max_position_embeddings + 10)
-            self.assertEqual(
-                model.config.max_position_embeddings,
-                max_position_embeddings + 10,
-            )
+            self.assertEqual(model.config.max_position_embeddings, max_position_embeddings + 10)
 
             # Check that it actually resizes the embeddings matrix
             if model.config.is_encoder_decoder:
-                (
-                    encoder_model_embed,
-                    decoder_model_embed,
-                ) = model.get_position_embeddings()
-                self.assertEqual(
-                    encoder_model_embed.weight.shape[0],
-                    encoder_cloned_embeddings.shape[0] + 10,
-                )
-                self.assertEqual(
-                    decoder_model_embed.weight.shape[0],
-                    decoder_cloned_embeddings.shape[0] + 10,
-                )
+                encoder_model_embed, decoder_model_embed = model.get_position_embeddings()
+                self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] + 10)
+                self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] + 10)
             else:
                 model_embed = model.get_position_embeddings()
                 self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
@@ -1330,25 +1156,13 @@ def test_resize_position_vector_embeddings(self):
             # Check that resizing the position embeddings with a smaller max_position_embeddings decreases
             # the model's max_position_embeddings
             model.resize_position_embeddings(max_position_embeddings - 5)
-            self.assertEqual(
-                model.config.max_position_embeddings,
-                max_position_embeddings - 5,
-            )
+            self.assertEqual(model.config.max_position_embeddings, max_position_embeddings - 5)
 
             # Check that it actually resizes the embeddings matrix
             if model.config.is_encoder_decoder:
-                (
-                    encoder_model_embed,
-                    decoder_model_embed,
-                ) = model.get_position_embeddings()
-                self.assertEqual(
-                    encoder_model_embed.weight.shape[0],
-                    encoder_cloned_embeddings.shape[0] - 5,
-                )
-                self.assertEqual(
-                    decoder_model_embed.weight.shape[0],
-                    decoder_cloned_embeddings.shape[0] - 5,
-                )
+                encoder_model_embed, decoder_model_embed = model.get_position_embeddings()
+                self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] - 5)
+                self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] - 5)
             else:
                 model_embed = model.get_position_embeddings()
                 self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5)
@@ -1477,10 +1291,7 @@ def test_resize_embeddings_untied(self):
             model(**self._prepare_for_class(inputs_dict, model_class))
 
     def test_model_common_attributes(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -1516,10 +1327,7 @@ def test_tie_model_weights(self):
         if not self.test_torchscript:
             return
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_same_values(layer_1, layer_2):
             equal = True
@@ -1564,10 +1372,7 @@ def check_same_values(layer_1, layer_2):
             # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
 
     def test_model_outputs_equivalence(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def set_nan_tensor_to_zero(t):
             t[t != t] = 0
@@ -1592,9 +1397,7 @@ def recursive_check(tuple_object, dict_object):
                     else:
                         self.assertTrue(
                             torch.allclose(
-                                set_nan_tensor_to_zero(tuple_object),
-                                set_nan_tensor_to_zero(dict_object),
-                                atol=1e-5,
+                                set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
                             ),
                             msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.",
                         )
@@ -1633,10 +1436,7 @@ def recursive_check(tuple_object, dict_object):
             tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             check_equivalence(
-                model,
-                tuple_inputs,
-                dict_inputs,
-                {"output_hidden_states": True, "output_attentions": True},
+                model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
             )
 
     @is_pt_tf_cross_test
@@ -1646,10 +1446,7 @@ def test_pt_tf_model_equivalence(self):
 
         import transformers
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             tf_model_class_name = "TF" + model_class.__name__  # Add the "TF" at the beginning
@@ -1769,18 +1566,11 @@ def test_pt_tf_model_equivalence(self):
 
     def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
         diff = np.abs((a - b)).max()
-        self.assertLessEqual(
-            diff,
-            tol,
-            f"Difference between torch and flax is {diff} (>= {tol}).",
-        )
+        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")
 
     @is_pt_flax_cross_test
     def test_equivalence_pt_to_flax(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             with self.subTest(model_class.__name__):
@@ -1818,11 +1608,7 @@ def test_equivalence_pt_to_flax(self):
                 # convert inputs to Flax
                 fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)}
                 fx_outputs = fx_model(**fx_inputs).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs),
-                    len(pt_outputs),
-                    "Output lengths differ between Flax and PyTorch",
-                )
+                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
                 for fx_output, pt_output in zip(fx_outputs, pt_outputs):
                     self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
 
@@ -1832,19 +1618,14 @@ def test_equivalence_pt_to_flax(self):
 
                 fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple()
                 self.assertEqual(
-                    len(fx_outputs_loaded),
-                    len(pt_outputs),
-                    "Output lengths differ between Flax and PyTorch",
+                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
                 )
                 for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
                     self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2)
 
     @is_pt_flax_cross_test
     def test_equivalence_flax_to_pt(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             with self.subTest(model_class.__name__):
@@ -1884,11 +1665,7 @@ def test_equivalence_flax_to_pt(self):
                 fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)}
 
                 fx_outputs = fx_model(**fx_inputs).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs),
-                    len(pt_outputs),
-                    "Output lengths differ between Flax and PyTorch",
-                )
+                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
 
                 for fx_output, pt_output in zip(fx_outputs, pt_outputs):
                     self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
@@ -1901,18 +1678,13 @@ def test_equivalence_flax_to_pt(self):
                     pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
 
                 self.assertEqual(
-                    len(fx_outputs),
-                    len(pt_outputs_loaded),
-                    "Output lengths differ between Flax and PyTorch",
+                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
                 )
                 for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
                     self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
 
     def test_inputs_embeds(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -1942,18 +1714,11 @@ def test_inputs_embeds(self):
 
     @require_torch_multi_gpu
     def test_multi_gpu_data_parallel_forward(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         # some params shouldn't be scattered by nn.DataParallel
         # so just remove them if they are present.
-        blacklist_non_batched_params = [
-            "head_mask",
-            "decoder_head_mask",
-            "cross_attn_head_mask",
-        ]
+        blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"]
         for k in blacklist_non_batched_params:
             inputs_dict.pop(k, None)
 
@@ -2039,10 +1804,7 @@ def test_model_parallel_equal_results(self):
         if not self.test_model_parallel:
             return
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_parallelizable_model_classes:
             inputs_dict = self._prepare_for_class(inputs_dict, model_class)
@@ -2080,10 +1842,7 @@ def test_model_parallel_beam_search(self):
             set(self.all_generative_model_classes).intersection(self.all_parallelizable_model_classes)
         )
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in all_generative_and_parallelizable_model_classes:
             inputs_dict = self._prepare_for_class(inputs_dict, model_class)
@@ -2103,22 +1862,11 @@ def cast_to_device(dictionary, device):
             model.generate(**cast_to_device(inputs_dict, "cuda:0"), num_beams=2)
 
     def test_problem_types(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         problem_types = [
-            {
-                "title": "multi_label_classification",
-                "num_labels": 2,
-                "dtype": torch.float,
-            },
-            {
-                "title": "single_label_classification",
-                "num_labels": 1,
-                "dtype": torch.long,
-            },
+            {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float},
+            {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long},
             {"title": "regression", "num_labels": 1, "dtype": torch.float},
         ]
 
@@ -2160,10 +1908,7 @@ def test_problem_types(self):
     def test_load_with_mismatched_shapes(self):
         if not self.test_mismatched_shapes:
             return
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
@@ -2184,9 +1929,7 @@ def test_load_with_mismatched_shapes(self):
 
                     with CaptureLogger(logger) as cl:
                         new_model = AutoModelForSequenceClassification.from_pretrained(
-                            tmp_dir,
-                            num_labels=42,
-                            ignore_mismatched_sizes=True,
+                            tmp_dir, num_labels=42, ignore_mismatched_sizes=True
                         )
                     self.assertIn("the shapes did not match", cl.out)
                     new_model.to(torch_device)
@@ -2380,11 +2123,7 @@ def tearDownClass(cls):
             pass
 
         try:
-            delete_repo(
-                token=cls._token,
-                name="test-model-org",
-                organization="valid_org",
-            )
+            delete_repo(token=cls._token, name="test-model-org", organization="valid_org")
         except HTTPError:
             pass
 
@@ -2400,19 +2139,11 @@ def tearDownClass(cls):
 
     def test_push_to_hub(self):
         config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
         model = BertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(
-                os.path.join(tmp_dir, "test-model"),
-                push_to_hub=True,
-                use_auth_token=self._token,
-            )
+            model.save_pretrained(os.path.join(tmp_dir, "test-model"), push_to_hub=True, use_auth_token=self._token)
 
             new_model = BertModel.from_pretrained(f"{USER}/test-model")
             for p1, p2 in zip(model.parameters(), new_model.parameters()):
@@ -2420,11 +2151,7 @@ def test_push_to_hub(self):
 
     def test_push_to_hub_in_organization(self):
         config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
         model = BertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -2447,19 +2174,12 @@ def test_push_to_hub_dynamic_model(self):
         model = CustomModel(config)
 
         with tempfile.TemporaryDirectory() as tmp_dir:
-            repo = Repository(
-                tmp_dir,
-                clone_from=f"{USER}/test-dynamic-model",
-                use_auth_token=self._token,
-            )
+            repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-model", use_auth_token=self._token)
             model.save_pretrained(tmp_dir)
             # checks
             self.assertDictEqual(
                 config.auto_map,
-                {
-                    "AutoConfig": "custom_configuration.CustomConfig",
-                    "AutoModel": "custom_modeling.CustomModel",
-                },
+                {"AutoConfig": "custom_configuration.CustomConfig", "AutoModel": "custom_modeling.CustomModel"},
             )
 
             repo.push_to_hub()

From 0f8069d656ce99b9c6d05ef5c058e82fdd6c71f2 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Fri, 25 Feb 2022 07:30:33 +0530
Subject: [PATCH 58/65] chore: revert to previous states for
 test_modeling_tf_common.py and modeling_tf_utils.py

---
 src/transformers/modeling_tf_utils.py | 125 ++------
 tests/test_modeling_tf_common.py      | 395 +++++---------------------
 2 files changed, 97 insertions(+), 423 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 4637130e7771c..8d2ad8d10c081 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -312,10 +312,9 @@ def booleans_processing(config, **kwargs):
 
     if tf.executing_eagerly():
         # Pure conv models (such as ConvNext) do not have `output_attentions`
-        final_booleans["output_attentions"] = kwargs.get("output_attentions", None)
-        if final_booleans["output_attentions"] is None:
-            final_booleans["output_attentions"] = config.output_attentions
-
+        final_booleans["output_attentions"] = (
+            kwargs["output_attentions"] if kwargs["output_attentions"] is not None else config.output_attentions
+        )
         final_booleans["output_hidden_states"] = (
             kwargs["output_hidden_states"]
             if kwargs["output_hidden_states"] is not None
@@ -367,17 +366,7 @@ def input_processing(func, config, input_ids, **kwargs):
     signature.pop("self", None)
     parameter_names = list(signature.keys())
     output = {}
-    allowed_types = (
-        tf.Tensor,
-        bool,
-        int,
-        ModelOutput,
-        tuple,
-        list,
-        dict,
-        np.ndarray,
-        KerasTensor,
-    )
+    allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray, KerasTensor)
 
     if "inputs" in kwargs["kwargs_call"]:
         warnings.warn(
@@ -490,13 +479,7 @@ def input_processing(func, config, input_ids, **kwargs):
     boolean_dict = {
         k: v
         for k, v in output.items()
-        if k
-        in [
-            "return_dict",
-            "output_attentions",
-            "output_hidden_states",
-            "use_cache",
-        ]
+        if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"]
     }
 
     output.update(
@@ -595,18 +578,11 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False,
                             # If yes we reshape the weight from the H5 file accordingly to the current weight
                             # If the two shapes are not compatible we raise an issue
                             try:
-                                array = np.reshape(
-                                    saved_weight_value,
-                                    K.int_shape(symbolic_weight),
-                                )
+                                array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight))
                             except ValueError as e:
                                 if ignore_mismatched_sizes:
                                     mismatched_layers.append(
-                                        (
-                                            symbolic_weight_name,
-                                            saved_weight_value.shape,
-                                            K.int_shape(symbolic_weight),
-                                        )
+                                        (symbolic_weight_name, saved_weight_value.shape, K.int_shape(symbolic_weight))
                                     )
                                     continue
                                 else:
@@ -650,17 +626,11 @@ def init_copy_embeddings(old_embeddings, new_num_tokens):
         # and we create a mask to properly identify the padded values and be replaced by the values of the newly created
         # embeddings
         current_weights = tf.pad(
-            old_embeddings.value(),
-            tf.convert_to_tensor([[0, size_diff], [0, 0]]),
-            constant_values=-1,
+            old_embeddings.value(), tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=-1
         )
         num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
         mask = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True)
-        mask = tf.pad(
-            mask,
-            tf.convert_to_tensor([[0, size_diff], [0, 0]]),
-            constant_values=False,
-        )
+        mask = tf.pad(mask, tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=False)
     else:
         # if the new size if lower than the old one, we take the current embeddings until the new size
         current_weights = tf.slice(
@@ -805,10 +775,7 @@ def _save_checkpoint(self, checkpoint_dir, epoch):
         # internally and which users are likely to use too
         weights_path = os.path.join(checkpoint_dir, "weights.h5")
         self.save_weights(weights_path)
-        extra_data = {
-            "epoch": epoch,
-            "optimizer_state": self.optimizer.get_weights(),
-        }
+        extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()}
         extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle")
         with open(extra_data_path, "wb") as f:
             pickle.dump(extra_data, f)
@@ -834,10 +801,7 @@ def load_repo_checkpoint(self, repo_path_or_name):
         if not os.path.isdir(repo_path_or_name):
             # If this isn't a local path, check that the remote repo exists and has a checkpoint in it
             repo_files = list_repo_files(repo_path_or_name)
-            for file in (
-                "checkpoint/weights.h5",
-                "checkpoint/extra_data.pickle",
-            ):
+            for file in ("checkpoint/weights.h5", "checkpoint/extra_data.pickle"):
                 if file not in repo_files:
                     raise FileNotFoundError(f"Repo {repo_path_or_name} does not contain checkpoint file {file}!")
             if "/" not in repo_path_or_name:
@@ -845,10 +809,7 @@ def load_repo_checkpoint(self, repo_path_or_name):
                 repo_path_or_name = self.get_full_repo_name(repo_path_or_name)
             else:
                 model_id = repo_path_or_name.split("/")[-1]
-            repo = Repository(
-                model_id,
-                clone_from=f"https://huggingface.co/{repo_path_or_name}",
-            )
+            repo = Repository(model_id, clone_from=f"https://huggingface.co/{repo_path_or_name}")
             local_dir = repo.local_dir
         else:
             local_dir = repo_path_or_name
@@ -1105,8 +1066,7 @@ def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]:
             `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
         """
         warnings.warn(
-            "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.",
-            FutureWarning,
+            "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning
         )
         return self.get_lm_head()
 
@@ -1117,10 +1077,7 @@ def get_prefix_bias_name(self) -> Union[None, str]:
         Return:
             `str`: The _prefix name of the bias.
         """
-        warnings.warn(
-            "The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.",
-            FutureWarning,
-        )
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
         return None
 
     def get_bias(self) -> Union[None, Dict[str, tf.Variable]]:
@@ -1268,25 +1225,15 @@ def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens):
             # initialize new bias
             if tf.math.greater(size_diff, 0):
                 padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]]
-                current_bias = tf.pad(
-                    weight.value(),
-                    tf.convert_to_tensor(padding_shape),
-                    constant_values=-1,
-                )
+                current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1)
                 num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
                 mask_shape = [num_tokens_to_copy] if first_dim is None else [1, num_tokens_to_copy]
                 bias_mask = tf.fill(tf.convert_to_tensor(mask_shape), True)
-                bias_mask = tf.pad(
-                    bias_mask,
-                    tf.convert_to_tensor(padding_shape),
-                    constant_values=False,
-                )
+                bias_mask = tf.pad(bias_mask, tf.convert_to_tensor(padding_shape), constant_values=False)
             else:
                 slice_from = [0] if first_dim is None else [0, 0]
                 current_bias = tf.slice(
-                    weight.value(),
-                    tf.convert_to_tensor(slice_from),
-                    tf.convert_to_tensor(final_shape),
+                    weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape)
                 )
                 bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True)
 
@@ -1427,11 +1374,7 @@ def save_pretrained(self, save_directory, saved_model=False, version=1, push_to_
 
         if saved_model:
             saved_model_dir = os.path.join(save_directory, "saved_model", str(version))
-            self.save(
-                saved_model_dir,
-                include_optimizer=False,
-                signatures=self.serving,
-            )
+            self.save(saved_model_dir, include_optimizer=False, signatures=self.serving)
             logger.info(f"Saved model created in {saved_model_dir}")
 
         # Save configuration file
@@ -1583,11 +1526,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         from_pipeline = kwargs.pop("_from_pipeline", None)
         from_auto_class = kwargs.pop("_from_auto", False)
 
-        user_agent = {
-            "file_type": "model",
-            "framework": "tensorflow",
-            "from_auto_class": from_auto_class,
-        }
+        user_agent = {"file_type": "model", "framework": "tensorflow", "from_auto_class": from_auto_class}
         if from_pipeline is not None:
             user_agent["using_pipeline"] = from_pipeline
 
@@ -1683,11 +1622,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                         "proxies": proxies,
                         "use_auth_token": use_auth_token,
                     }
-                    if has_file(
-                        pretrained_model_name_or_path,
-                        WEIGHTS_NAME,
-                        **has_file_kwargs,
-                    ):
+                    if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs):
                         raise EnvironmentError(
                             f"{pretrained_model_name_or_path} does not appear to have a file named {TF2_WEIGHTS_NAME} "
                             "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from "
@@ -1837,9 +1772,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
 # To update the docstring, we need to copy the method, otherwise we change the original docstring.
 TFPreTrainedModel.push_to_hub = copy_func(TFPreTrainedModel.push_to_hub)
 TFPreTrainedModel.push_to_hub.__doc__ = TFPreTrainedModel.push_to_hub.__doc__.format(
-    object="model",
-    object_class="TFAutoModel",
-    object_files="model checkpoint",
+    object="model", object_class="TFAutoModel", object_files="model checkpoint"
 )
 
 
@@ -1868,9 +1801,7 @@ def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
 
     def build(self, input_shape):
         self.weight = self.add_weight(
-            "weight",
-            shape=[self.nx, self.nf],
-            initializer=get_initializer(self.initializer_range),
+            "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range)
         )
         self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer())
 
@@ -1916,9 +1847,7 @@ def build(self, input_shape):
         https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
         self.weight = self.add_weight(
-            "weight",
-            shape=[self.vocab_size, self.hidden_size],
-            initializer=get_initializer(self.initializer_range),
+            "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
         )
         super().build(input_shape)
 
@@ -2032,9 +1961,7 @@ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **
             else:
                 num_classes = config.hidden_size
             self.summary = tf.keras.layers.Dense(
-                num_classes,
-                kernel_initializer=get_initializer(initializer_range),
-                name="summary",
+                num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
             )
 
         self.has_activation = False
@@ -2129,9 +2056,7 @@ def register_for_auto_class(cls, auto_class="TFAutoModel"):
         cls._auto_class = auto_class
 
 
-def get_initializer(
-    initializer_range: float = 0.02,
-) -> tf.initializers.TruncatedNormal:
+def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal:
     """
     Creates a `tf.initializers.TruncatedNormal` with the given range.
 
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 2038f29e56cf8..142bff7cae06e 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -83,8 +83,7 @@
             # Restrict TensorFlow to only allocate x GB of memory on the GPUs
             try:
                 tf.config.set_logical_device_configuration(
-                    gpu,
-                    [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)],
+                    gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)]
                 )
                 logical_gpus = tf.config.list_logical_devices("GPU")
                 print("Logical GPUs", logical_gpus)
@@ -117,10 +116,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d
 
         if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
             inputs_dict = {
-                k: tf.tile(
-                    tf.expand_dims(v, 1),
-                    (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1),
-                )
+                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
                 if isinstance(v, tf.Tensor) and v.ndim > 0
                 else v
                 for k, v in inputs_dict.items()
@@ -148,11 +144,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d
                 *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
             ]:
                 inputs_dict["labels"] = tf.zeros(
-                    (
-                        self.model_tester.batch_size,
-                        self.model_tester.seq_length,
-                    ),
-                    dtype=tf.int32,
+                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
                 )
         return inputs_dict
 
@@ -160,10 +152,7 @@ def test_initialization(self):
         pass
 
     def test_save_load(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -177,10 +166,7 @@ def test_save_load(self):
                 self.assert_outputs_same(after_outputs, outputs)
 
     def test_save_load_config(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -232,10 +218,7 @@ def test_onnx_compliancy(self):
         if not self.test_onnx:
             return
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         INTERNAL_OPS = [
             "Assert",
             "AssignVariableOp",
@@ -282,10 +265,7 @@ def test_onnx_runtime_optimize(self):
         import onnxruntime
         import tf2onnx
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -296,10 +276,7 @@ def test_onnx_runtime_optimize(self):
             onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
 
     def test_keras_save_load(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         tf_main_layer_classes = set(
             module_member
@@ -344,8 +321,7 @@ def test_keras_save_load(self):
                     )
                 else:
                     model = tf.keras.models.load_model(
-                        filepath,
-                        custom_objects={main_layer_class.__name__: main_layer_class},
+                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
                     )
                 assert isinstance(model, tf.keras.Model)
                 after_outputs = model(inputs_dict)
@@ -372,10 +348,7 @@ def test_pt_tf_model_equivalence(self):
 
         import transformers
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
@@ -388,9 +361,7 @@ def test_pt_tf_model_equivalence(self):
 
             # Check we can load pt model in tf and vice-versa with model => model functions
             tf_model = transformers.load_pytorch_model_in_tf2_model(
-                tf_model,
-                pt_model,
-                tf_inputs=self._prepare_for_class(inputs_dict, model_class),
+                tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
             )
             pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
 
@@ -411,10 +382,7 @@ def test_pt_tf_model_equivalence(self):
 
             with torch.no_grad():
                 pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(
-                self._prepare_for_class(inputs_dict, model_class),
-                training=False,
-            )
+            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)
 
             tf_hidden_states = tfo[0].numpy()
             pt_hidden_states = pto[0].numpy()
@@ -473,20 +441,14 @@ def test_pt_tf_model_equivalence(self):
             self.assertLessEqual(max_diff, 4e-2)
 
     def test_compile_tf_model(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         max_input = getattr(self.model_tester, "max_position_embeddings", 512)
         optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
         loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
         metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
 
         for model_class in self.all_model_classes:
-            if model_class.__name__ in [
-                "TFSpeech2TextModel",
-                "TFSpeech2TextForConditionalGeneration",
-            ]:
+            if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]:
                 inputs = {
                     "decoder_input_ids": tf.keras.Input(
                         batch_shape=(2, max_input),
@@ -510,11 +472,7 @@ def test_compile_tf_model(self):
                         name="decoder_input_ids",
                         dtype="int32",
                     ),
-                    "input_ids": tf.keras.Input(
-                        batch_shape=(2, max_input),
-                        name="input_ids",
-                        dtype="int32",
-                    ),
+                    "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
                 }
             # `pixel_values` implies that the input is an image
             elif model_class.main_input_name == "pixel_values":
@@ -530,11 +488,7 @@ def test_compile_tf_model(self):
                 )
             elif model_class.__name__ in ["TFCLIPModel"]:
                 inputs = {
-                    "input_ids": tf.keras.Input(
-                        batch_shape=(3, max_input),
-                        name="input_ids",
-                        dtype="int32",
-                    ),
+                    "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"),
                     "pixel_values": tf.keras.Input(
                         batch_shape=(
                             3,
@@ -547,11 +501,7 @@ def test_compile_tf_model(self):
                     ),
                 }
             elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs = tf.keras.Input(
-                    batch_shape=(4, 2, max_input),
-                    name="input_ids",
-                    dtype="int32",
-                )
+                inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
             else:
                 inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
 
@@ -574,10 +524,7 @@ def test_compile_tf_model(self):
             extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 
     def test_keyword_and_dict_args(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -593,21 +540,10 @@ def test_keyword_and_dict_args(self):
             self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
 
     def test_attention_outputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
-        decoder_seq_length = getattr(
-            self.model_tester,
-            "decoder_seq_length",
-            self.model_tester.seq_length,
-        )
-        encoder_seq_length = getattr(
-            self.model_tester,
-            "encoder_seq_length",
-            self.model_tester.seq_length,
-        )
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length)
+        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length)
         decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
         encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
 
@@ -618,11 +554,7 @@ def check_decoder_attentions_output(outputs):
             self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(decoder_attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    decoder_seq_length,
-                    decoder_key_length,
-                ],
+                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
             )
 
         def check_encoder_attentions_output(outputs):
@@ -632,11 +564,7 @@ def check_encoder_attentions_output(outputs):
             self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    encoder_seq_length,
-                    encoder_key_length,
-                ],
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
             )
 
         for model_class in self.all_model_classes:
@@ -678,10 +606,7 @@ def test_headmasking(self):
             return
 
         random.Random().seed(42)
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         random.Random().seed()
 
         inputs_dict["output_attentions"] = True
@@ -694,19 +619,11 @@ def test_headmasking(self):
             def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
                 if i == 0:
                     return tf.concat(
-                        (
-                            tf.zeros(1, dtype=tf.float32),
-                            tf.ones(attention_heads - 1, dtype=tf.float32),
-                        ),
-                        0,
+                        (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0
                     )
                 elif i == num_hidden_layers - 1:
                     return tf.concat(
-                        (
-                            tf.zeros(attention_heads - 1, dtype=tf.float32),
-                            tf.ones(1, dtype=tf.float32),
-                        ),
-                        0,
+                        (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0
                     )
                 else:
                     return tf.ones(attention_heads, dtype=tf.float32)
@@ -735,8 +652,7 @@ def check_attentions_validity(attentions):
                 # Remove Nan
                 for t in attentions:
                     self.assertLess(
-                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(),
-                        (tf.size(t) / 4).numpy(),
+                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy()
                     )  # Check we don't have more than 25% nans (arbitrary)
 
                 attentions = [
@@ -744,23 +660,11 @@ def check_attentions_validity(attentions):
                 ]  # remove them (the test is less complete)
 
                 self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
-                self.assertNotEqual(
-                    tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(),
-                    0.0,
-                )
+                self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0)
                 if len(attentions) > 2:  # encoder-decodere models have only 2 layers in each modules
-                    self.assertNotEqual(
-                        tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(),
-                        0.0,
-                    )
-                self.assertAlmostEqual(
-                    tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(),
-                    0.0,
-                )
-                self.assertNotEqual(
-                    tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(),
-                    0.0,
-                )
+                    self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0)
+                self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0)
+                self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0)
 
             if model.config.is_encoder_decoder:
                 check_attentions_validity(outputs.encoder_attentions)
@@ -771,18 +675,13 @@ def check_attentions_validity(attentions):
                 check_attentions_validity(outputs.attentions)
 
     def test_hidden_states_output(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_hidden_states_output(config, inputs_dict, model_class):
             model = model_class(config)
             outputs = model(self._prepare_for_class(inputs_dict, model_class))
             expected_num_layers = getattr(
-                self.model_tester,
-                "expected_num_hidden_layers",
-                self.model_tester.num_hidden_layers + 1,
+                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
             )
 
             if model.config.is_encoder_decoder:
@@ -793,18 +692,12 @@ def check_hidden_states_output(config, inputs_dict, model_class):
                 self.assertEqual(len(encoder_hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(encoder_hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
+                    [self.model_tester.seq_length, self.model_tester.hidden_size],
                 )
                 self.assertEqual(len(decoder_hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(decoder_hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
+                    [self.model_tester.seq_length, self.model_tester.hidden_size],
                 )
             else:
                 hidden_states = outputs.hidden_states
@@ -812,10 +705,7 @@ def check_hidden_states_output(config, inputs_dict, model_class):
                 self.assertEqual(len(hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
+                    [self.model_tester.seq_length, self.model_tester.hidden_size],
                 )
 
         for model_class in self.all_model_classes:
@@ -827,10 +717,7 @@ def check_hidden_states_output(config, inputs_dict, model_class):
             check_hidden_states_output(config, inputs_dict, model_class)
 
     def test_model_common_attributes(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         text_in_text_out_models = (
             get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
             + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
@@ -860,22 +747,13 @@ def test_model_common_attributes(self):
                 assert name is None
 
     def test_determinism(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
             first, second = (
-                model(
-                    self._prepare_for_class(inputs_dict, model_class),
-                    training=False,
-                )[0],
-                model(
-                    self._prepare_for_class(inputs_dict, model_class),
-                    training=False,
-                )[0],
+                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
+                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
             )
             out_1 = first.numpy()
             out_2 = second.numpy()
@@ -886,10 +764,7 @@ def test_determinism(self):
 
     def test_model_outputs_equivalence(self):
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
             tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
@@ -939,17 +814,11 @@ def recursive_check(tuple_object, dict_object):
             tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             check_equivalence(
-                model,
-                tuple_inputs,
-                dict_inputs,
-                {"output_hidden_states": True, "output_attentions": True},
+                model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
             )
 
     def test_inputs_embeds(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -976,10 +845,7 @@ def test_inputs_embeds(self):
             model(inputs)
 
     def test_numpy_arrays_inputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def prepare_numpy_arrays(inputs_dict):
             inputs_np_dict = {}
@@ -1004,10 +870,7 @@ def prepare_numpy_arrays(inputs_dict):
     def test_resize_token_embeddings(self):
         if not self.test_resize_embeddings:
             return
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def _get_word_embedding_weight(model, embedding_layer):
             embeds = getattr(embedding_layer, "weight", None)
@@ -1066,25 +929,16 @@ def _get_word_embedding_weight(model, embedding_layer):
 
                 if old_output_embeddings is not None and new_output_embeddings is not None:
                     self.assertEqual(new_output_embeddings.shape[0], assert_size)
-                    self.assertEqual(
-                        new_output_embeddings.shape[1],
-                        old_output_embeddings.shape[1],
-                    )
+                    self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1])
 
                     models_equal = True
-                    for p1, p2 in zip(
-                        old_output_embeddings.value(),
-                        new_output_embeddings.value(),
-                    ):
+                    for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
                         if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
                             models_equal = False
                     self.assertTrue(models_equal)
 
     def test_lm_head_model_random_no_beam_search_generate(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
 
         # iterate over all generative models
@@ -1111,25 +965,16 @@ def test_lm_head_model_random_no_beam_search_generate(self):
 
             # check bad words tokens language generation
             # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [
-                self._generate_random_bad_tokens(1, model),
-                self._generate_random_bad_tokens(2, model),
-            ]
+            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
             output_tokens = model.generate(
-                input_ids,
-                do_sample=True,
-                bad_words_ids=bad_words_ids,
-                num_return_sequences=2,
+                input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2
             )
             # only count generated tokens
             generated_ids = output_tokens[:, input_ids.shape[-1] :]
             self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
     def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
         if input_ids is None:
             input_ids = inputs_dict.get("input_features", None)
@@ -1162,10 +1007,7 @@ def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
                 self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
 
     def test_lm_head_model_random_beam_search_generate(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
 
         for model_class in self.all_generative_model_classes:
@@ -1180,12 +1022,7 @@ def test_lm_head_model_random_beam_search_generate(self):
 
             with self.assertRaises(AssertionError):
                 # generating more sequences than having beams leads is not possible
-                model.generate(
-                    input_ids,
-                    do_sample=False,
-                    num_return_sequences=3,
-                    num_beams=2,
-                )
+                model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
 
             # num_return_sequences > 1, sample
             self._check_generated_ids(
@@ -1197,37 +1034,20 @@ def test_lm_head_model_random_beam_search_generate(self):
                 )
             )
             # num_return_sequences > 1, greedy
-            self._check_generated_ids(
-                model.generate(
-                    input_ids,
-                    do_sample=False,
-                    num_beams=2,
-                    num_return_sequences=2,
-                )
-            )
+            self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2))
 
             # check bad words tokens language generation
             # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [
-                self._generate_random_bad_tokens(1, model),
-                self._generate_random_bad_tokens(2, model),
-            ]
+            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
             output_tokens = model.generate(
-                input_ids,
-                do_sample=False,
-                bad_words_ids=bad_words_ids,
-                num_beams=2,
-                num_return_sequences=2,
+                input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2
             )
             # only count generated tokens
             generated_ids = output_tokens[:, input_ids.shape[-1] :]
             self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
     def test_lm_head_model_beam_search_generate_dict_outputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
         if input_ids is None:
             input_ids = inputs_dict.get("input_features", None)
@@ -1262,20 +1082,14 @@ def test_lm_head_model_beam_search_generate_dict_outputs(self):
                 self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
 
     def test_loss_computation(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
             model = model_class(config)
             if getattr(model, "hf_compute_loss", None):
                 # The number of elements in the loss should be the same as the number of elements in the label
                 prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
                 added_label = prepared_for_class[
-                    sorted(
-                        list(prepared_for_class.keys() - inputs_dict.keys()),
-                        reverse=True,
-                    )[0]
+                    sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0]
                 ]
                 loss_size = tf.size(added_label)
 
@@ -1286,11 +1100,7 @@ def test_loss_computation(self):
 
                 # Test that model correctly compute the loss with kwargs
                 prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                possible_input_names = {
-                    "input_ids",
-                    "pixel_values",
-                    "input_features",
-                }
+                possible_input_names = {"input_ids", "pixel_values", "input_features"}
                 input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
                 model_input = prepared_for_class.pop(input_name)
 
@@ -1334,15 +1144,8 @@ def test_loss_computation(self):
                 self.assertEqual(loss.shape, [loss_size])
 
     def test_generate_with_headmasking(self):
-        attention_names = [
-            "encoder_attentions",
-            "decoder_attentions",
-            "cross_attentions",
-        ]
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_generative_model_classes:
             model = model_class(config)
@@ -1377,10 +1180,7 @@ def test_generate_with_headmasking(self):
     def test_load_with_mismatched_shapes(self):
         if not self.test_mismatched_shapes:
             return
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
@@ -1487,13 +1287,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
 def random_attention_mask(shape, rng=None, name=None, dtype=None):
     attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype)
     # make sure that at least one token is attended to for each batch
-    attn_mask = tf.concat(
-        [
-            tf.constant(value=1, shape=(shape[0], 1), dtype=dtype),
-            attn_mask[:, 1:],
-        ],
-        axis=1,
-    )
+    attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1)
     return attn_mask
 
 
@@ -1510,10 +1304,7 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
     for _ in range(total_dims):
         values.append(rng.random() * scale)
 
-    return tf.reshape(
-        tf.constant(values, dtype=dtype if dtype is not None else tf.float32),
-        shape=shape,
-    )
+    return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape)
 
 
 @require_tf
@@ -1592,34 +1383,12 @@ def test_top_k_top_p_filtering(self):
         )
 
         non_inf_expected_idx = tf.convert_to_tensor(
-            [
-                [0, 0],
-                [0, 9],
-                [0, 10],
-                [0, 25],
-                [0, 26],
-                [1, 13],
-                [1, 17],
-                [1, 18],
-                [1, 20],
-                [1, 27],
-            ],
+            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]],
             dtype=tf.int32,
         )  # expected non filtered idx as noted above
 
         non_inf_expected_output = tf.convert_to_tensor(
-            [
-                8.222099,
-                7.3534126,
-                8.432078,
-                7.4402075,
-                9.38451,
-                6.271159,
-                8.827531,
-                5.4402995,
-                7.3857956,
-                9.677023,
-            ],
+            [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023],
             dtype=tf.float32,
         )  # expected non filtered values as noted above
 
@@ -1650,31 +1419,19 @@ def tearDownClass(cls):
             pass
 
         try:
-            delete_repo(
-                token=cls._token,
-                name="test-model-tf-org",
-                organization="valid_org",
-            )
+            delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org")
         except HTTPError:
             pass
 
     def test_push_to_hub(self):
         config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
         model = TFBertModel(config)
         # Make sure model is properly initialized
         _ = model(model.dummy_inputs)
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(
-                os.path.join(tmp_dir, "test-model-tf"),
-                push_to_hub=True,
-                use_auth_token=self._token,
-            )
+            model.save_pretrained(os.path.join(tmp_dir, "test-model-tf"), push_to_hub=True, use_auth_token=self._token)
 
             new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
             models_equal = True
@@ -1685,11 +1442,7 @@ def test_push_to_hub(self):
 
     def test_push_to_hub_with_model_card(self):
         config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
         model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -1698,11 +1451,7 @@ def test_push_to_hub_with_model_card(self):
 
     def test_push_to_hub_in_organization(self):
         config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
         model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:

From f4292b45353051910aeb75c6767e9f2335f2706c Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Fri, 25 Feb 2022 07:49:33 +0530
Subject: [PATCH 59/65] fix: tests for convnext.

---
 tests/test_modeling_tf_common.py | 395 +++++++++++++++++++++++++------
 1 file changed, 323 insertions(+), 72 deletions(-)

diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 142bff7cae06e..2038f29e56cf8 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -83,7 +83,8 @@
             # Restrict TensorFlow to only allocate x GB of memory on the GPUs
             try:
                 tf.config.set_logical_device_configuration(
-                    gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)]
+                    gpu,
+                    [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)],
                 )
                 logical_gpus = tf.config.list_logical_devices("GPU")
                 print("Logical GPUs", logical_gpus)
@@ -116,7 +117,10 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d
 
         if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
             inputs_dict = {
-                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
+                k: tf.tile(
+                    tf.expand_dims(v, 1),
+                    (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1),
+                )
                 if isinstance(v, tf.Tensor) and v.ndim > 0
                 else v
                 for k, v in inputs_dict.items()
@@ -144,7 +148,11 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d
                 *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
             ]:
                 inputs_dict["labels"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
+                    (
+                        self.model_tester.batch_size,
+                        self.model_tester.seq_length,
+                    ),
+                    dtype=tf.int32,
                 )
         return inputs_dict
 
@@ -152,7 +160,10 @@ def test_initialization(self):
         pass
 
     def test_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -166,7 +177,10 @@ def test_save_load(self):
                 self.assert_outputs_same(after_outputs, outputs)
 
     def test_save_load_config(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -218,7 +232,10 @@ def test_onnx_compliancy(self):
         if not self.test_onnx:
             return
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         INTERNAL_OPS = [
             "Assert",
             "AssignVariableOp",
@@ -265,7 +282,10 @@ def test_onnx_runtime_optimize(self):
         import onnxruntime
         import tf2onnx
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -276,7 +296,10 @@ def test_onnx_runtime_optimize(self):
             onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
 
     def test_keras_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         tf_main_layer_classes = set(
             module_member
@@ -321,7 +344,8 @@ def test_keras_save_load(self):
                     )
                 else:
                     model = tf.keras.models.load_model(
-                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
+                        filepath,
+                        custom_objects={main_layer_class.__name__: main_layer_class},
                     )
                 assert isinstance(model, tf.keras.Model)
                 after_outputs = model(inputs_dict)
@@ -348,7 +372,10 @@ def test_pt_tf_model_equivalence(self):
 
         import transformers
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
@@ -361,7 +388,9 @@ def test_pt_tf_model_equivalence(self):
 
             # Check we can load pt model in tf and vice-versa with model => model functions
             tf_model = transformers.load_pytorch_model_in_tf2_model(
-                tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
+                tf_model,
+                pt_model,
+                tf_inputs=self._prepare_for_class(inputs_dict, model_class),
             )
             pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
 
@@ -382,7 +411,10 @@ def test_pt_tf_model_equivalence(self):
 
             with torch.no_grad():
                 pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)
+            tfo = tf_model(
+                self._prepare_for_class(inputs_dict, model_class),
+                training=False,
+            )
 
             tf_hidden_states = tfo[0].numpy()
             pt_hidden_states = pto[0].numpy()
@@ -441,14 +473,20 @@ def test_pt_tf_model_equivalence(self):
             self.assertLessEqual(max_diff, 4e-2)
 
     def test_compile_tf_model(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         max_input = getattr(self.model_tester, "max_position_embeddings", 512)
         optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
         loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
         metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
 
         for model_class in self.all_model_classes:
-            if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]:
+            if model_class.__name__ in [
+                "TFSpeech2TextModel",
+                "TFSpeech2TextForConditionalGeneration",
+            ]:
                 inputs = {
                     "decoder_input_ids": tf.keras.Input(
                         batch_shape=(2, max_input),
@@ -472,7 +510,11 @@ def test_compile_tf_model(self):
                         name="decoder_input_ids",
                         dtype="int32",
                     ),
-                    "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
+                    "input_ids": tf.keras.Input(
+                        batch_shape=(2, max_input),
+                        name="input_ids",
+                        dtype="int32",
+                    ),
                 }
             # `pixel_values` implies that the input is an image
             elif model_class.main_input_name == "pixel_values":
@@ -488,7 +530,11 @@ def test_compile_tf_model(self):
                 )
             elif model_class.__name__ in ["TFCLIPModel"]:
                 inputs = {
-                    "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"),
+                    "input_ids": tf.keras.Input(
+                        batch_shape=(3, max_input),
+                        name="input_ids",
+                        dtype="int32",
+                    ),
                     "pixel_values": tf.keras.Input(
                         batch_shape=(
                             3,
@@ -501,7 +547,11 @@ def test_compile_tf_model(self):
                     ),
                 }
             elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
+                inputs = tf.keras.Input(
+                    batch_shape=(4, 2, max_input),
+                    name="input_ids",
+                    dtype="int32",
+                )
             else:
                 inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
 
@@ -524,7 +574,10 @@ def test_compile_tf_model(self):
             extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 
     def test_keyword_and_dict_args(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -540,10 +593,21 @@ def test_keyword_and_dict_args(self):
             self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
 
     def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length)
+        decoder_seq_length = getattr(
+            self.model_tester,
+            "decoder_seq_length",
+            self.model_tester.seq_length,
+        )
+        encoder_seq_length = getattr(
+            self.model_tester,
+            "encoder_seq_length",
+            self.model_tester.seq_length,
+        )
         decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
         encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
 
@@ -554,7 +618,11 @@ def check_decoder_attentions_output(outputs):
             self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
+                [
+                    self.model_tester.num_attention_heads,
+                    decoder_seq_length,
+                    decoder_key_length,
+                ],
             )
 
         def check_encoder_attentions_output(outputs):
@@ -564,7 +632,11 @@ def check_encoder_attentions_output(outputs):
             self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+                [
+                    self.model_tester.num_attention_heads,
+                    encoder_seq_length,
+                    encoder_key_length,
+                ],
             )
 
         for model_class in self.all_model_classes:
@@ -606,7 +678,10 @@ def test_headmasking(self):
             return
 
         random.Random().seed(42)
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         random.Random().seed()
 
         inputs_dict["output_attentions"] = True
@@ -619,11 +694,19 @@ def test_headmasking(self):
             def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
                 if i == 0:
                     return tf.concat(
-                        (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0
+                        (
+                            tf.zeros(1, dtype=tf.float32),
+                            tf.ones(attention_heads - 1, dtype=tf.float32),
+                        ),
+                        0,
                     )
                 elif i == num_hidden_layers - 1:
                     return tf.concat(
-                        (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0
+                        (
+                            tf.zeros(attention_heads - 1, dtype=tf.float32),
+                            tf.ones(1, dtype=tf.float32),
+                        ),
+                        0,
                     )
                 else:
                     return tf.ones(attention_heads, dtype=tf.float32)
@@ -652,7 +735,8 @@ def check_attentions_validity(attentions):
                 # Remove Nan
                 for t in attentions:
                     self.assertLess(
-                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy()
+                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(),
+                        (tf.size(t) / 4).numpy(),
                     )  # Check we don't have more than 25% nans (arbitrary)
 
                 attentions = [
@@ -660,11 +744,23 @@ def check_attentions_validity(attentions):
                 ]  # remove them (the test is less complete)
 
                 self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
-                self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0)
+                self.assertNotEqual(
+                    tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(),
+                    0.0,
+                )
                 if len(attentions) > 2:  # encoder-decodere models have only 2 layers in each modules
-                    self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0)
-                self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0)
-                self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0)
+                    self.assertNotEqual(
+                        tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(),
+                        0.0,
+                    )
+                self.assertAlmostEqual(
+                    tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(),
+                    0.0,
+                )
+                self.assertNotEqual(
+                    tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(),
+                    0.0,
+                )
 
             if model.config.is_encoder_decoder:
                 check_attentions_validity(outputs.encoder_attentions)
@@ -675,13 +771,18 @@ def check_attentions_validity(attentions):
                 check_attentions_validity(outputs.attentions)
 
     def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_hidden_states_output(config, inputs_dict, model_class):
             model = model_class(config)
             outputs = model(self._prepare_for_class(inputs_dict, model_class))
             expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+                self.model_tester,
+                "expected_num_hidden_layers",
+                self.model_tester.num_hidden_layers + 1,
             )
 
             if model.config.is_encoder_decoder:
@@ -692,12 +793,18 @@ def check_hidden_states_output(config, inputs_dict, model_class):
                 self.assertEqual(len(encoder_hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(encoder_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
                 )
                 self.assertEqual(len(decoder_hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(decoder_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
                 )
             else:
                 hidden_states = outputs.hidden_states
@@ -705,7 +812,10 @@ def check_hidden_states_output(config, inputs_dict, model_class):
                 self.assertEqual(len(hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
+                    [
+                        self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
                 )
 
         for model_class in self.all_model_classes:
@@ -717,7 +827,10 @@ def check_hidden_states_output(config, inputs_dict, model_class):
             check_hidden_states_output(config, inputs_dict, model_class)
 
     def test_model_common_attributes(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         text_in_text_out_models = (
             get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
             + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
@@ -747,13 +860,22 @@ def test_model_common_attributes(self):
                 assert name is None
 
     def test_determinism(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
             first, second = (
-                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
-                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
+                model(
+                    self._prepare_for_class(inputs_dict, model_class),
+                    training=False,
+                )[0],
+                model(
+                    self._prepare_for_class(inputs_dict, model_class),
+                    training=False,
+                )[0],
             )
             out_1 = first.numpy()
             out_2 = second.numpy()
@@ -764,7 +886,10 @@ def test_determinism(self):
 
     def test_model_outputs_equivalence(self):
 
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
             tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
@@ -814,11 +939,17 @@ def recursive_check(tuple_object, dict_object):
             tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             check_equivalence(
-                model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
+                model,
+                tuple_inputs,
+                dict_inputs,
+                {"output_hidden_states": True, "output_attentions": True},
             )
 
     def test_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -845,7 +976,10 @@ def test_inputs_embeds(self):
             model(inputs)
 
     def test_numpy_arrays_inputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def prepare_numpy_arrays(inputs_dict):
             inputs_np_dict = {}
@@ -870,7 +1004,10 @@ def prepare_numpy_arrays(inputs_dict):
     def test_resize_token_embeddings(self):
         if not self.test_resize_embeddings:
             return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         def _get_word_embedding_weight(model, embedding_layer):
             embeds = getattr(embedding_layer, "weight", None)
@@ -929,16 +1066,25 @@ def _get_word_embedding_weight(model, embedding_layer):
 
                 if old_output_embeddings is not None and new_output_embeddings is not None:
                     self.assertEqual(new_output_embeddings.shape[0], assert_size)
-                    self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1])
+                    self.assertEqual(
+                        new_output_embeddings.shape[1],
+                        old_output_embeddings.shape[1],
+                    )
 
                     models_equal = True
-                    for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
+                    for p1, p2 in zip(
+                        old_output_embeddings.value(),
+                        new_output_embeddings.value(),
+                    ):
                         if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
                             models_equal = False
                     self.assertTrue(models_equal)
 
     def test_lm_head_model_random_no_beam_search_generate(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
 
         # iterate over all generative models
@@ -965,16 +1111,25 @@ def test_lm_head_model_random_no_beam_search_generate(self):
 
             # check bad words tokens language generation
             # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
+            bad_words_ids = [
+                self._generate_random_bad_tokens(1, model),
+                self._generate_random_bad_tokens(2, model),
+            ]
             output_tokens = model.generate(
-                input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2
+                input_ids,
+                do_sample=True,
+                bad_words_ids=bad_words_ids,
+                num_return_sequences=2,
             )
             # only count generated tokens
             generated_ids = output_tokens[:, input_ids.shape[-1] :]
             self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
     def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
         if input_ids is None:
             input_ids = inputs_dict.get("input_features", None)
@@ -1007,7 +1162,10 @@ def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
                 self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
 
     def test_lm_head_model_random_beam_search_generate(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
 
         for model_class in self.all_generative_model_classes:
@@ -1022,7 +1180,12 @@ def test_lm_head_model_random_beam_search_generate(self):
 
             with self.assertRaises(AssertionError):
                 # generating more sequences than having beams leads is not possible
-                model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
+                model.generate(
+                    input_ids,
+                    do_sample=False,
+                    num_return_sequences=3,
+                    num_beams=2,
+                )
 
             # num_return_sequences > 1, sample
             self._check_generated_ids(
@@ -1034,20 +1197,37 @@ def test_lm_head_model_random_beam_search_generate(self):
                 )
             )
             # num_return_sequences > 1, greedy
-            self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2))
+            self._check_generated_ids(
+                model.generate(
+                    input_ids,
+                    do_sample=False,
+                    num_beams=2,
+                    num_return_sequences=2,
+                )
+            )
 
             # check bad words tokens language generation
             # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
+            bad_words_ids = [
+                self._generate_random_bad_tokens(1, model),
+                self._generate_random_bad_tokens(2, model),
+            ]
             output_tokens = model.generate(
-                input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2
+                input_ids,
+                do_sample=False,
+                bad_words_ids=bad_words_ids,
+                num_beams=2,
+                num_return_sequences=2,
             )
             # only count generated tokens
             generated_ids = output_tokens[:, input_ids.shape[-1] :]
             self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
     def test_lm_head_model_beam_search_generate_dict_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
         if input_ids is None:
             input_ids = inputs_dict.get("input_features", None)
@@ -1082,14 +1262,20 @@ def test_lm_head_model_beam_search_generate_dict_outputs(self):
                 self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
 
     def test_loss_computation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
             model = model_class(config)
             if getattr(model, "hf_compute_loss", None):
                 # The number of elements in the loss should be the same as the number of elements in the label
                 prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
                 added_label = prepared_for_class[
-                    sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0]
+                    sorted(
+                        list(prepared_for_class.keys() - inputs_dict.keys()),
+                        reverse=True,
+                    )[0]
                 ]
                 loss_size = tf.size(added_label)
 
@@ -1100,7 +1286,11 @@ def test_loss_computation(self):
 
                 # Test that model correctly compute the loss with kwargs
                 prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                possible_input_names = {"input_ids", "pixel_values", "input_features"}
+                possible_input_names = {
+                    "input_ids",
+                    "pixel_values",
+                    "input_features",
+                }
                 input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
                 model_input = prepared_for_class.pop(input_name)
 
@@ -1144,8 +1334,15 @@ def test_loss_computation(self):
                 self.assertEqual(loss.shape, [loss_size])
 
     def test_generate_with_headmasking(self):
-        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        attention_names = [
+            "encoder_attentions",
+            "decoder_attentions",
+            "cross_attentions",
+        ]
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_generative_model_classes:
             model = model_class(config)
@@ -1180,7 +1377,10 @@ def test_generate_with_headmasking(self):
     def test_load_with_mismatched_shapes(self):
         if not self.test_mismatched_shapes:
             return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
@@ -1287,7 +1487,13 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
 def random_attention_mask(shape, rng=None, name=None, dtype=None):
     attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype)
     # make sure that at least one token is attended to for each batch
-    attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1)
+    attn_mask = tf.concat(
+        [
+            tf.constant(value=1, shape=(shape[0], 1), dtype=dtype),
+            attn_mask[:, 1:],
+        ],
+        axis=1,
+    )
     return attn_mask
 
 
@@ -1304,7 +1510,10 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
     for _ in range(total_dims):
         values.append(rng.random() * scale)
 
-    return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape)
+    return tf.reshape(
+        tf.constant(values, dtype=dtype if dtype is not None else tf.float32),
+        shape=shape,
+    )
 
 
 @require_tf
@@ -1383,12 +1592,34 @@ def test_top_k_top_p_filtering(self):
         )
 
         non_inf_expected_idx = tf.convert_to_tensor(
-            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]],
+            [
+                [0, 0],
+                [0, 9],
+                [0, 10],
+                [0, 25],
+                [0, 26],
+                [1, 13],
+                [1, 17],
+                [1, 18],
+                [1, 20],
+                [1, 27],
+            ],
             dtype=tf.int32,
         )  # expected non filtered idx as noted above
 
         non_inf_expected_output = tf.convert_to_tensor(
-            [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023],
+            [
+                8.222099,
+                7.3534126,
+                8.432078,
+                7.4402075,
+                9.38451,
+                6.271159,
+                8.827531,
+                5.4402995,
+                7.3857956,
+                9.677023,
+            ],
             dtype=tf.float32,
         )  # expected non filtered values as noted above
 
@@ -1419,19 +1650,31 @@ def tearDownClass(cls):
             pass
 
         try:
-            delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org")
+            delete_repo(
+                token=cls._token,
+                name="test-model-tf-org",
+                organization="valid_org",
+            )
         except HTTPError:
             pass
 
     def test_push_to_hub(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
         model = TFBertModel(config)
         # Make sure model is properly initialized
         _ = model(model.dummy_inputs)
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(os.path.join(tmp_dir, "test-model-tf"), push_to_hub=True, use_auth_token=self._token)
+            model.save_pretrained(
+                os.path.join(tmp_dir, "test-model-tf"),
+                push_to_hub=True,
+                use_auth_token=self._token,
+            )
 
             new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
             models_equal = True
@@ -1442,7 +1685,11 @@ def test_push_to_hub(self):
 
     def test_push_to_hub_with_model_card(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
         model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -1451,7 +1698,11 @@ def test_push_to_hub_with_model_card(self):
 
     def test_push_to_hub_in_organization(self):
         config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
         )
         model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:

From 8b99c8e63f05469e1e33d856ad185c7613c111ad Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Fri, 25 Feb 2022 13:26:32 +0530
Subject: [PATCH 60/65] chore: removed output_attentions argument from convnext
 config.

---
 src/transformers/modeling_tf_utils.py         | 127 ++++++++++++++----
 .../models/convnext/configuration_convnext.py |   1 -
 2 files changed, 101 insertions(+), 27 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 8d2ad8d10c081..9d392ec6e4ff0 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -312,9 +312,10 @@ def booleans_processing(config, **kwargs):
 
     if tf.executing_eagerly():
         # Pure conv models (such as ConvNext) do not have `output_attentions`
-        final_booleans["output_attentions"] = (
-            kwargs["output_attentions"] if kwargs["output_attentions"] is not None else config.output_attentions
-        )
+        final_booleans["output_attentions"] = kwargs.get("output_attentions", None)
+        if final_booleans["output_attentions"] is None:
+            final_booleans["output_attentions"] = config.output_attentions
+
         final_booleans["output_hidden_states"] = (
             kwargs["output_hidden_states"]
             if kwargs["output_hidden_states"] is not None
@@ -366,7 +367,17 @@ def input_processing(func, config, input_ids, **kwargs):
     signature.pop("self", None)
     parameter_names = list(signature.keys())
     output = {}
-    allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray, KerasTensor)
+    allowed_types = (
+        tf.Tensor,
+        bool,
+        int,
+        ModelOutput,
+        tuple,
+        list,
+        dict,
+        np.ndarray,
+        KerasTensor,
+    )
 
     if "inputs" in kwargs["kwargs_call"]:
         warnings.warn(
@@ -479,7 +490,13 @@ def input_processing(func, config, input_ids, **kwargs):
     boolean_dict = {
         k: v
         for k, v in output.items()
-        if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"]
+        if k
+        in [
+            "return_dict",
+            "output_attentions",
+            "output_hidden_states",
+            "use_cache",
+        ]
     }
 
     output.update(
@@ -578,11 +595,18 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False,
                             # If yes we reshape the weight from the H5 file accordingly to the current weight
                             # If the two shapes are not compatible we raise an issue
                             try:
-                                array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight))
+                                array = np.reshape(
+                                    saved_weight_value,
+                                    K.int_shape(symbolic_weight),
+                                )
                             except ValueError as e:
                                 if ignore_mismatched_sizes:
                                     mismatched_layers.append(
-                                        (symbolic_weight_name, saved_weight_value.shape, K.int_shape(symbolic_weight))
+                                        (
+                                            symbolic_weight_name,
+                                            saved_weight_value.shape,
+                                            K.int_shape(symbolic_weight),
+                                        )
                                     )
                                     continue
                                 else:
@@ -626,11 +650,17 @@ def init_copy_embeddings(old_embeddings, new_num_tokens):
         # and we create a mask to properly identify the padded values and be replaced by the values of the newly created
         # embeddings
         current_weights = tf.pad(
-            old_embeddings.value(), tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=-1
+            old_embeddings.value(),
+            tf.convert_to_tensor([[0, size_diff], [0, 0]]),
+            constant_values=-1,
         )
         num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
         mask = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True)
-        mask = tf.pad(mask, tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=False)
+        mask = tf.pad(
+            mask,
+            tf.convert_to_tensor([[0, size_diff], [0, 0]]),
+            constant_values=False,
+        )
     else:
         # if the new size if lower than the old one, we take the current embeddings until the new size
         current_weights = tf.slice(
@@ -775,7 +805,10 @@ def _save_checkpoint(self, checkpoint_dir, epoch):
         # internally and which users are likely to use too
         weights_path = os.path.join(checkpoint_dir, "weights.h5")
         self.save_weights(weights_path)
-        extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()}
+        extra_data = {
+            "epoch": epoch,
+            "optimizer_state": self.optimizer.get_weights(),
+        }
         extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle")
         with open(extra_data_path, "wb") as f:
             pickle.dump(extra_data, f)
@@ -801,7 +834,10 @@ def load_repo_checkpoint(self, repo_path_or_name):
         if not os.path.isdir(repo_path_or_name):
             # If this isn't a local path, check that the remote repo exists and has a checkpoint in it
             repo_files = list_repo_files(repo_path_or_name)
-            for file in ("checkpoint/weights.h5", "checkpoint/extra_data.pickle"):
+            for file in (
+                "checkpoint/weights.h5",
+                "checkpoint/extra_data.pickle",
+            ):
                 if file not in repo_files:
                     raise FileNotFoundError(f"Repo {repo_path_or_name} does not contain checkpoint file {file}!")
             if "/" not in repo_path_or_name:
@@ -809,7 +845,10 @@ def load_repo_checkpoint(self, repo_path_or_name):
                 repo_path_or_name = self.get_full_repo_name(repo_path_or_name)
             else:
                 model_id = repo_path_or_name.split("/")[-1]
-            repo = Repository(model_id, clone_from=f"https://huggingface.co/{repo_path_or_name}")
+            repo = Repository(
+                model_id,
+                clone_from=f"https://huggingface.co/{repo_path_or_name}",
+            )
             local_dir = repo.local_dir
         else:
             local_dir = repo_path_or_name
@@ -1066,7 +1105,8 @@ def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]:
             `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
         """
         warnings.warn(
-            "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning
+            "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.",
+            FutureWarning,
         )
         return self.get_lm_head()
 
@@ -1077,7 +1117,10 @@ def get_prefix_bias_name(self) -> Union[None, str]:
         Return:
             `str`: The _prefix name of the bias.
         """
-        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        warnings.warn(
+            "The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.",
+            FutureWarning,
+        )
         return None
 
     def get_bias(self) -> Union[None, Dict[str, tf.Variable]]:
@@ -1225,15 +1268,25 @@ def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens):
             # initialize new bias
             if tf.math.greater(size_diff, 0):
                 padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]]
-                current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1)
+                current_bias = tf.pad(
+                    weight.value(),
+                    tf.convert_to_tensor(padding_shape),
+                    constant_values=-1,
+                )
                 num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
                 mask_shape = [num_tokens_to_copy] if first_dim is None else [1, num_tokens_to_copy]
                 bias_mask = tf.fill(tf.convert_to_tensor(mask_shape), True)
-                bias_mask = tf.pad(bias_mask, tf.convert_to_tensor(padding_shape), constant_values=False)
+                bias_mask = tf.pad(
+                    bias_mask,
+                    tf.convert_to_tensor(padding_shape),
+                    constant_values=False,
+                )
             else:
                 slice_from = [0] if first_dim is None else [0, 0]
                 current_bias = tf.slice(
-                    weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape)
+                    weight.value(),
+                    tf.convert_to_tensor(slice_from),
+                    tf.convert_to_tensor(final_shape),
                 )
                 bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True)
 
@@ -1374,7 +1427,11 @@ def save_pretrained(self, save_directory, saved_model=False, version=1, push_to_
 
         if saved_model:
             saved_model_dir = os.path.join(save_directory, "saved_model", str(version))
-            self.save(saved_model_dir, include_optimizer=False, signatures=self.serving)
+            self.save(
+                saved_model_dir,
+                include_optimizer=False,
+                signatures=self.serving,
+            )
             logger.info(f"Saved model created in {saved_model_dir}")
 
         # Save configuration file
@@ -1526,7 +1583,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         from_pipeline = kwargs.pop("_from_pipeline", None)
         from_auto_class = kwargs.pop("_from_auto", False)
 
-        user_agent = {"file_type": "model", "framework": "tensorflow", "from_auto_class": from_auto_class}
+        user_agent = {
+            "file_type": "model",
+            "framework": "tensorflow",
+            "from_auto_class": from_auto_class,
+        }
         if from_pipeline is not None:
             user_agent["using_pipeline"] = from_pipeline
 
@@ -1622,7 +1683,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                         "proxies": proxies,
                         "use_auth_token": use_auth_token,
                     }
-                    if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs):
+                    if has_file(
+                        pretrained_model_name_or_path,
+                        WEIGHTS_NAME,
+                        **has_file_kwargs,
+                    ):
                         raise EnvironmentError(
                             f"{pretrained_model_name_or_path} does not appear to have a file named {TF2_WEIGHTS_NAME} "
                             "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from "
@@ -1772,7 +1837,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
 # To update the docstring, we need to copy the method, otherwise we change the original docstring.
 TFPreTrainedModel.push_to_hub = copy_func(TFPreTrainedModel.push_to_hub)
 TFPreTrainedModel.push_to_hub.__doc__ = TFPreTrainedModel.push_to_hub.__doc__.format(
-    object="model", object_class="TFAutoModel", object_files="model checkpoint"
+    object="model",
+    object_class="TFAutoModel",
+    object_files="model checkpoint",
 )
 
 
@@ -1801,7 +1868,9 @@ def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
 
     def build(self, input_shape):
         self.weight = self.add_weight(
-            "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range)
+            "weight",
+            shape=[self.nx, self.nf],
+            initializer=get_initializer(self.initializer_range),
         )
         self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer())
 
@@ -1839,7 +1908,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
+        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
 
     def build(self, input_shape):
         """
@@ -1847,7 +1916,9 @@ def build(self, input_shape):
         https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
         self.weight = self.add_weight(
-            "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
+            "weight",
+            shape=[self.vocab_size, self.hidden_size],
+            initializer=get_initializer(self.initializer_range),
         )
         super().build(input_shape)
 
@@ -1961,7 +2032,9 @@ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **
             else:
                 num_classes = config.hidden_size
             self.summary = tf.keras.layers.Dense(
-                num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
+                num_classes,
+                kernel_initializer=get_initializer(initializer_range),
+                name="summary",
             )
 
         self.has_activation = False
@@ -2056,7 +2129,9 @@ def register_for_auto_class(cls, auto_class="TFAutoModel"):
         cls._auto_class = auto_class
 
 
-def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal:
+def get_initializer(
+    initializer_range: float = 0.02,
+) -> tf.initializers.TruncatedNormal:
     """
     Creates a `tf.initializers.TruncatedNormal` with the given range.
 
diff --git a/src/transformers/models/convnext/configuration_convnext.py b/src/transformers/models/convnext/configuration_convnext.py
index c09a54e86a7e2..74067ad337bbf 100644
--- a/src/transformers/models/convnext/configuration_convnext.py
+++ b/src/transformers/models/convnext/configuration_convnext.py
@@ -101,4 +101,3 @@ def __init__(
         self.layer_scale_init_value = layer_scale_init_value
         self.drop_path_rate = drop_path_rate
         self.image_size = image_size
-        self.output_attentions = None

From 78198505f19c1abce0361ca972af95cb72574271 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Fri, 25 Feb 2022 17:17:28 +0530
Subject: [PATCH 61/65] chore: revert to the earlier tf utils.

---
 src/transformers/modeling_tf_utils.py | 121 +++++---------------------
 1 file changed, 23 insertions(+), 98 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 9d392ec6e4ff0..2ab3e79381171 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -315,7 +315,6 @@ def booleans_processing(config, **kwargs):
         final_booleans["output_attentions"] = kwargs.get("output_attentions", None)
         if final_booleans["output_attentions"] is None:
             final_booleans["output_attentions"] = config.output_attentions
-
         final_booleans["output_hidden_states"] = (
             kwargs["output_hidden_states"]
             if kwargs["output_hidden_states"] is not None
@@ -367,17 +366,7 @@ def input_processing(func, config, input_ids, **kwargs):
     signature.pop("self", None)
     parameter_names = list(signature.keys())
     output = {}
-    allowed_types = (
-        tf.Tensor,
-        bool,
-        int,
-        ModelOutput,
-        tuple,
-        list,
-        dict,
-        np.ndarray,
-        KerasTensor,
-    )
+    allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray, KerasTensor)
 
     if "inputs" in kwargs["kwargs_call"]:
         warnings.warn(
@@ -490,13 +479,7 @@ def input_processing(func, config, input_ids, **kwargs):
     boolean_dict = {
         k: v
         for k, v in output.items()
-        if k
-        in [
-            "return_dict",
-            "output_attentions",
-            "output_hidden_states",
-            "use_cache",
-        ]
+        if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"]
     }
 
     output.update(
@@ -595,18 +578,11 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False,
                             # If yes we reshape the weight from the H5 file accordingly to the current weight
                             # If the two shapes are not compatible we raise an issue
                             try:
-                                array = np.reshape(
-                                    saved_weight_value,
-                                    K.int_shape(symbolic_weight),
-                                )
+                                array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight))
                             except ValueError as e:
                                 if ignore_mismatched_sizes:
                                     mismatched_layers.append(
-                                        (
-                                            symbolic_weight_name,
-                                            saved_weight_value.shape,
-                                            K.int_shape(symbolic_weight),
-                                        )
+                                        (symbolic_weight_name, saved_weight_value.shape, K.int_shape(symbolic_weight))
                                     )
                                     continue
                                 else:
@@ -650,17 +626,11 @@ def init_copy_embeddings(old_embeddings, new_num_tokens):
         # and we create a mask to properly identify the padded values and be replaced by the values of the newly created
         # embeddings
         current_weights = tf.pad(
-            old_embeddings.value(),
-            tf.convert_to_tensor([[0, size_diff], [0, 0]]),
-            constant_values=-1,
+            old_embeddings.value(), tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=-1
         )
         num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
         mask = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True)
-        mask = tf.pad(
-            mask,
-            tf.convert_to_tensor([[0, size_diff], [0, 0]]),
-            constant_values=False,
-        )
+        mask = tf.pad(mask, tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=False)
     else:
         # if the new size if lower than the old one, we take the current embeddings until the new size
         current_weights = tf.slice(
@@ -805,10 +775,7 @@ def _save_checkpoint(self, checkpoint_dir, epoch):
         # internally and which users are likely to use too
         weights_path = os.path.join(checkpoint_dir, "weights.h5")
         self.save_weights(weights_path)
-        extra_data = {
-            "epoch": epoch,
-            "optimizer_state": self.optimizer.get_weights(),
-        }
+        extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()}
         extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle")
         with open(extra_data_path, "wb") as f:
             pickle.dump(extra_data, f)
@@ -834,10 +801,7 @@ def load_repo_checkpoint(self, repo_path_or_name):
         if not os.path.isdir(repo_path_or_name):
             # If this isn't a local path, check that the remote repo exists and has a checkpoint in it
             repo_files = list_repo_files(repo_path_or_name)
-            for file in (
-                "checkpoint/weights.h5",
-                "checkpoint/extra_data.pickle",
-            ):
+            for file in ("checkpoint/weights.h5", "checkpoint/extra_data.pickle"):
                 if file not in repo_files:
                     raise FileNotFoundError(f"Repo {repo_path_or_name} does not contain checkpoint file {file}!")
             if "/" not in repo_path_or_name:
@@ -845,10 +809,7 @@ def load_repo_checkpoint(self, repo_path_or_name):
                 repo_path_or_name = self.get_full_repo_name(repo_path_or_name)
             else:
                 model_id = repo_path_or_name.split("/")[-1]
-            repo = Repository(
-                model_id,
-                clone_from=f"https://huggingface.co/{repo_path_or_name}",
-            )
+            repo = Repository(model_id, clone_from=f"https://huggingface.co/{repo_path_or_name}")
             local_dir = repo.local_dir
         else:
             local_dir = repo_path_or_name
@@ -1105,8 +1066,7 @@ def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]:
             `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
         """
         warnings.warn(
-            "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.",
-            FutureWarning,
+            "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning
         )
         return self.get_lm_head()
 
@@ -1117,10 +1077,7 @@ def get_prefix_bias_name(self) -> Union[None, str]:
         Return:
             `str`: The _prefix name of the bias.
         """
-        warnings.warn(
-            "The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.",
-            FutureWarning,
-        )
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
         return None
 
     def get_bias(self) -> Union[None, Dict[str, tf.Variable]]:
@@ -1268,25 +1225,15 @@ def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens):
             # initialize new bias
             if tf.math.greater(size_diff, 0):
                 padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]]
-                current_bias = tf.pad(
-                    weight.value(),
-                    tf.convert_to_tensor(padding_shape),
-                    constant_values=-1,
-                )
+                current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1)
                 num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
                 mask_shape = [num_tokens_to_copy] if first_dim is None else [1, num_tokens_to_copy]
                 bias_mask = tf.fill(tf.convert_to_tensor(mask_shape), True)
-                bias_mask = tf.pad(
-                    bias_mask,
-                    tf.convert_to_tensor(padding_shape),
-                    constant_values=False,
-                )
+                bias_mask = tf.pad(bias_mask, tf.convert_to_tensor(padding_shape), constant_values=False)
             else:
                 slice_from = [0] if first_dim is None else [0, 0]
                 current_bias = tf.slice(
-                    weight.value(),
-                    tf.convert_to_tensor(slice_from),
-                    tf.convert_to_tensor(final_shape),
+                    weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape)
                 )
                 bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True)
 
@@ -1427,11 +1374,7 @@ def save_pretrained(self, save_directory, saved_model=False, version=1, push_to_
 
         if saved_model:
             saved_model_dir = os.path.join(save_directory, "saved_model", str(version))
-            self.save(
-                saved_model_dir,
-                include_optimizer=False,
-                signatures=self.serving,
-            )
+            self.save(saved_model_dir, include_optimizer=False, signatures=self.serving)
             logger.info(f"Saved model created in {saved_model_dir}")
 
         # Save configuration file
@@ -1583,11 +1526,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         from_pipeline = kwargs.pop("_from_pipeline", None)
         from_auto_class = kwargs.pop("_from_auto", False)
 
-        user_agent = {
-            "file_type": "model",
-            "framework": "tensorflow",
-            "from_auto_class": from_auto_class,
-        }
+        user_agent = {"file_type": "model", "framework": "tensorflow", "from_auto_class": from_auto_class}
         if from_pipeline is not None:
             user_agent["using_pipeline"] = from_pipeline
 
@@ -1683,11 +1622,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                         "proxies": proxies,
                         "use_auth_token": use_auth_token,
                     }
-                    if has_file(
-                        pretrained_model_name_or_path,
-                        WEIGHTS_NAME,
-                        **has_file_kwargs,
-                    ):
+                    if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs):
                         raise EnvironmentError(
                             f"{pretrained_model_name_or_path} does not appear to have a file named {TF2_WEIGHTS_NAME} "
                             "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from "
@@ -1837,9 +1772,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
 # To update the docstring, we need to copy the method, otherwise we change the original docstring.
 TFPreTrainedModel.push_to_hub = copy_func(TFPreTrainedModel.push_to_hub)
 TFPreTrainedModel.push_to_hub.__doc__ = TFPreTrainedModel.push_to_hub.__doc__.format(
-    object="model",
-    object_class="TFAutoModel",
-    object_files="model checkpoint",
+    object="model", object_class="TFAutoModel", object_files="model checkpoint"
 )
 
 
@@ -1868,9 +1801,7 @@ def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
 
     def build(self, input_shape):
         self.weight = self.add_weight(
-            "weight",
-            shape=[self.nx, self.nf],
-            initializer=get_initializer(self.initializer_range),
+            "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range)
         )
         self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer())
 
@@ -1908,7 +1839,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
+        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
 
     def build(self, input_shape):
         """
@@ -1916,9 +1847,7 @@ def build(self, input_shape):
         https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
         self.weight = self.add_weight(
-            "weight",
-            shape=[self.vocab_size, self.hidden_size],
-            initializer=get_initializer(self.initializer_range),
+            "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
         )
         super().build(input_shape)
 
@@ -2032,9 +1961,7 @@ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **
             else:
                 num_classes = config.hidden_size
             self.summary = tf.keras.layers.Dense(
-                num_classes,
-                kernel_initializer=get_initializer(initializer_range),
-                name="summary",
+                num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
             )
 
         self.has_activation = False
@@ -2129,9 +2056,7 @@ def register_for_auto_class(cls, auto_class="TFAutoModel"):
         cls._auto_class = auto_class
 
 
-def get_initializer(
-    initializer_range: float = 0.02,
-) -> tf.initializers.TruncatedNormal:
+def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal:
     """
     Creates a `tf.initializers.TruncatedNormal` with the given range.
 

From ba9484ff11c370859092d05f3013198691c03572 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Fri, 25 Feb 2022 17:59:37 +0530
Subject: [PATCH 62/65] fix: output shapes of the hidden states

---
 .../models/convnext/modeling_tf_convnext.py      | 16 ++++++++++++----
 tests/convnext/test_modeling_tf_convnext.py      |  4 ++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 328194dddbc2c..f97b493bf30cd 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -292,7 +292,9 @@ def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwa
         self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
         self.encoder = TFConvNextEncoder(config, name="encoder")
         self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
-        self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None
+        # We are setting the `data_format` like so because from here on we will revert to the
+        # NCHW output format
+        self.pooler = tf.keras.layers.GlobalAvgPool2D(data_format="channels_first") if add_pooling_layer else None
 
     def call(
         self,
@@ -333,15 +335,21 @@ def call(
         )
 
         last_hidden_state = encoder_outputs[0]
+        # Change to NCHW output format to have uniformity in the modules
+        last_hidden_state = tf.transpose(last_hidden_state, perm=(0, 3, 1, 2))
         pooled_output = self.layernorm(self.pooler(last_hidden_state))
 
+        # Change the other hidden state outputs to NCHW as well
+        if output_hidden_states:
+            hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]])
+
         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
 
         return TFBaseModelOutputWithPooling(
             last_hidden_state=last_hidden_state,
             pooler_output=pooled_output,
-            hidden_states=encoder_outputs.hidden_states,
+            hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
         )
 
 
@@ -504,10 +512,10 @@ def call(
 
         # converts back NHWC -> NCHW, to match PT's output
         if not return_dict:
-            return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:]
+            return (outputs[0],) + outputs[1:]
 
         return TFBaseModelOutputWithPooling(
-            last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)),
+            last_hidden_state=outputs.last_hidden_state,
             pooler_output=outputs.pooler_output,
             hidden_states=outputs.hidden_states,
         )
diff --git a/tests/convnext/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py
index cfc2646176448..f5ccd1438af03 100644
--- a/tests/convnext/test_modeling_tf_convnext.py
+++ b/tests/convnext/test_modeling_tf_convnext.py
@@ -199,9 +199,9 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             expected_num_stages = self.model_tester.num_stages
             self.assertEqual(len(hidden_states), expected_num_stages + 1)
 
-            # ConvNext's feature maps are of shape (batch_size, height, width, num_channels) in TF
+            # ConvNext's feature maps are of shape (batch_size, num_channels, height, width)
             self.assertListEqual(
-                list(hidden_states[0].shape[1:-1]),
+                list(hidden_states[0].shape[-2:]),
                 [
                     self.model_tester.image_size // 4,
                     self.model_tester.image_size // 4,

From 553bac5342defc01ebc28f6947b9c85b8d0dcab5 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Fri, 25 Feb 2022 18:28:36 +0530
Subject: [PATCH 63/65] chore: removed unnecessary comment

---
 src/transformers/models/convnext/modeling_tf_convnext.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index f97b493bf30cd..c6e7f1311e532 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -510,7 +510,6 @@ def call(
             training=inputs["training"],
         )
 
-        # converts back NHWC -> NCHW, to match PT's output
         if not return_dict:
             return (outputs[0],) + outputs[1:]
 

From d22e0cbc8fbd07d8199f785c96f7598fd805f13f Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Fri, 25 Feb 2022 20:34:02 +0530
Subject: [PATCH 64/65] chore: reverting to the right
 test_modeling_tf_common.py.

---
 tests/test_modeling_tf_common.py | 395 ++++++-------------------------
 1 file changed, 72 insertions(+), 323 deletions(-)

diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 2038f29e56cf8..142bff7cae06e 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -83,8 +83,7 @@
             # Restrict TensorFlow to only allocate x GB of memory on the GPUs
             try:
                 tf.config.set_logical_device_configuration(
-                    gpu,
-                    [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)],
+                    gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)]
                 )
                 logical_gpus = tf.config.list_logical_devices("GPU")
                 print("Logical GPUs", logical_gpus)
@@ -117,10 +116,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d
 
         if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
             inputs_dict = {
-                k: tf.tile(
-                    tf.expand_dims(v, 1),
-                    (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1),
-                )
+                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
                 if isinstance(v, tf.Tensor) and v.ndim > 0
                 else v
                 for k, v in inputs_dict.items()
@@ -148,11 +144,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d
                 *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
             ]:
                 inputs_dict["labels"] = tf.zeros(
-                    (
-                        self.model_tester.batch_size,
-                        self.model_tester.seq_length,
-                    ),
-                    dtype=tf.int32,
+                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
                 )
         return inputs_dict
 
@@ -160,10 +152,7 @@ def test_initialization(self):
         pass
 
     def test_save_load(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -177,10 +166,7 @@ def test_save_load(self):
                 self.assert_outputs_same(after_outputs, outputs)
 
     def test_save_load_config(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -232,10 +218,7 @@ def test_onnx_compliancy(self):
         if not self.test_onnx:
             return
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         INTERNAL_OPS = [
             "Assert",
             "AssignVariableOp",
@@ -282,10 +265,7 @@ def test_onnx_runtime_optimize(self):
         import onnxruntime
         import tf2onnx
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -296,10 +276,7 @@ def test_onnx_runtime_optimize(self):
             onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
 
     def test_keras_save_load(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         tf_main_layer_classes = set(
             module_member
@@ -344,8 +321,7 @@ def test_keras_save_load(self):
                     )
                 else:
                     model = tf.keras.models.load_model(
-                        filepath,
-                        custom_objects={main_layer_class.__name__: main_layer_class},
+                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
                     )
                 assert isinstance(model, tf.keras.Model)
                 after_outputs = model(inputs_dict)
@@ -372,10 +348,7 @@ def test_pt_tf_model_equivalence(self):
 
         import transformers
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
@@ -388,9 +361,7 @@ def test_pt_tf_model_equivalence(self):
 
             # Check we can load pt model in tf and vice-versa with model => model functions
             tf_model = transformers.load_pytorch_model_in_tf2_model(
-                tf_model,
-                pt_model,
-                tf_inputs=self._prepare_for_class(inputs_dict, model_class),
+                tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
             )
             pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
 
@@ -411,10 +382,7 @@ def test_pt_tf_model_equivalence(self):
 
             with torch.no_grad():
                 pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(
-                self._prepare_for_class(inputs_dict, model_class),
-                training=False,
-            )
+            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)
 
             tf_hidden_states = tfo[0].numpy()
             pt_hidden_states = pto[0].numpy()
@@ -473,20 +441,14 @@ def test_pt_tf_model_equivalence(self):
             self.assertLessEqual(max_diff, 4e-2)
 
     def test_compile_tf_model(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         max_input = getattr(self.model_tester, "max_position_embeddings", 512)
         optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
         loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
         metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
 
         for model_class in self.all_model_classes:
-            if model_class.__name__ in [
-                "TFSpeech2TextModel",
-                "TFSpeech2TextForConditionalGeneration",
-            ]:
+            if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]:
                 inputs = {
                     "decoder_input_ids": tf.keras.Input(
                         batch_shape=(2, max_input),
@@ -510,11 +472,7 @@ def test_compile_tf_model(self):
                         name="decoder_input_ids",
                         dtype="int32",
                     ),
-                    "input_ids": tf.keras.Input(
-                        batch_shape=(2, max_input),
-                        name="input_ids",
-                        dtype="int32",
-                    ),
+                    "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
                 }
             # `pixel_values` implies that the input is an image
             elif model_class.main_input_name == "pixel_values":
@@ -530,11 +488,7 @@ def test_compile_tf_model(self):
                 )
             elif model_class.__name__ in ["TFCLIPModel"]:
                 inputs = {
-                    "input_ids": tf.keras.Input(
-                        batch_shape=(3, max_input),
-                        name="input_ids",
-                        dtype="int32",
-                    ),
+                    "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"),
                     "pixel_values": tf.keras.Input(
                         batch_shape=(
                             3,
@@ -547,11 +501,7 @@ def test_compile_tf_model(self):
                     ),
                 }
             elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs = tf.keras.Input(
-                    batch_shape=(4, 2, max_input),
-                    name="input_ids",
-                    dtype="int32",
-                )
+                inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
             else:
                 inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
 
@@ -574,10 +524,7 @@ def test_compile_tf_model(self):
             extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 
     def test_keyword_and_dict_args(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -593,21 +540,10 @@ def test_keyword_and_dict_args(self):
             self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
 
     def test_attention_outputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
-        decoder_seq_length = getattr(
-            self.model_tester,
-            "decoder_seq_length",
-            self.model_tester.seq_length,
-        )
-        encoder_seq_length = getattr(
-            self.model_tester,
-            "encoder_seq_length",
-            self.model_tester.seq_length,
-        )
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length)
+        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length)
         decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
         encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
 
@@ -618,11 +554,7 @@ def check_decoder_attentions_output(outputs):
             self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(decoder_attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    decoder_seq_length,
-                    decoder_key_length,
-                ],
+                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
             )
 
         def check_encoder_attentions_output(outputs):
@@ -632,11 +564,7 @@ def check_encoder_attentions_output(outputs):
             self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    encoder_seq_length,
-                    encoder_key_length,
-                ],
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
             )
 
         for model_class in self.all_model_classes:
@@ -678,10 +606,7 @@ def test_headmasking(self):
             return
 
         random.Random().seed(42)
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         random.Random().seed()
 
         inputs_dict["output_attentions"] = True
@@ -694,19 +619,11 @@ def test_headmasking(self):
             def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
                 if i == 0:
                     return tf.concat(
-                        (
-                            tf.zeros(1, dtype=tf.float32),
-                            tf.ones(attention_heads - 1, dtype=tf.float32),
-                        ),
-                        0,
+                        (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0
                     )
                 elif i == num_hidden_layers - 1:
                     return tf.concat(
-                        (
-                            tf.zeros(attention_heads - 1, dtype=tf.float32),
-                            tf.ones(1, dtype=tf.float32),
-                        ),
-                        0,
+                        (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0
                     )
                 else:
                     return tf.ones(attention_heads, dtype=tf.float32)
@@ -735,8 +652,7 @@ def check_attentions_validity(attentions):
                 # Remove Nan
                 for t in attentions:
                     self.assertLess(
-                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(),
-                        (tf.size(t) / 4).numpy(),
+                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy()
                     )  # Check we don't have more than 25% nans (arbitrary)
 
                 attentions = [
@@ -744,23 +660,11 @@ def check_attentions_validity(attentions):
                 ]  # remove them (the test is less complete)
 
                 self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
-                self.assertNotEqual(
-                    tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(),
-                    0.0,
-                )
+                self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0)
                 if len(attentions) > 2:  # encoder-decodere models have only 2 layers in each modules
-                    self.assertNotEqual(
-                        tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(),
-                        0.0,
-                    )
-                self.assertAlmostEqual(
-                    tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(),
-                    0.0,
-                )
-                self.assertNotEqual(
-                    tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(),
-                    0.0,
-                )
+                    self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0)
+                self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0)
+                self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0)
 
             if model.config.is_encoder_decoder:
                 check_attentions_validity(outputs.encoder_attentions)
@@ -771,18 +675,13 @@ def check_attentions_validity(attentions):
                 check_attentions_validity(outputs.attentions)
 
     def test_hidden_states_output(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_hidden_states_output(config, inputs_dict, model_class):
             model = model_class(config)
             outputs = model(self._prepare_for_class(inputs_dict, model_class))
             expected_num_layers = getattr(
-                self.model_tester,
-                "expected_num_hidden_layers",
-                self.model_tester.num_hidden_layers + 1,
+                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
             )
 
             if model.config.is_encoder_decoder:
@@ -793,18 +692,12 @@ def check_hidden_states_output(config, inputs_dict, model_class):
                 self.assertEqual(len(encoder_hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(encoder_hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
+                    [self.model_tester.seq_length, self.model_tester.hidden_size],
                 )
                 self.assertEqual(len(decoder_hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(decoder_hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
+                    [self.model_tester.seq_length, self.model_tester.hidden_size],
                 )
             else:
                 hidden_states = outputs.hidden_states
@@ -812,10 +705,7 @@ def check_hidden_states_output(config, inputs_dict, model_class):
                 self.assertEqual(len(hidden_states), expected_num_layers)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
+                    [self.model_tester.seq_length, self.model_tester.hidden_size],
                 )
 
         for model_class in self.all_model_classes:
@@ -827,10 +717,7 @@ def check_hidden_states_output(config, inputs_dict, model_class):
             check_hidden_states_output(config, inputs_dict, model_class)
 
     def test_model_common_attributes(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         text_in_text_out_models = (
             get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
             + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
@@ -860,22 +747,13 @@ def test_model_common_attributes(self):
                 assert name is None
 
     def test_determinism(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
             first, second = (
-                model(
-                    self._prepare_for_class(inputs_dict, model_class),
-                    training=False,
-                )[0],
-                model(
-                    self._prepare_for_class(inputs_dict, model_class),
-                    training=False,
-                )[0],
+                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
+                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
             )
             out_1 = first.numpy()
             out_2 = second.numpy()
@@ -886,10 +764,7 @@ def test_determinism(self):
 
     def test_model_outputs_equivalence(self):
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
             tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
@@ -939,17 +814,11 @@ def recursive_check(tuple_object, dict_object):
             tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
             check_equivalence(
-                model,
-                tuple_inputs,
-                dict_inputs,
-                {"output_hidden_states": True, "output_attentions": True},
+                model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
             )
 
     def test_inputs_embeds(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -976,10 +845,7 @@ def test_inputs_embeds(self):
             model(inputs)
 
     def test_numpy_arrays_inputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def prepare_numpy_arrays(inputs_dict):
             inputs_np_dict = {}
@@ -1004,10 +870,7 @@ def prepare_numpy_arrays(inputs_dict):
     def test_resize_token_embeddings(self):
         if not self.test_resize_embeddings:
             return
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def _get_word_embedding_weight(model, embedding_layer):
             embeds = getattr(embedding_layer, "weight", None)
@@ -1066,25 +929,16 @@ def _get_word_embedding_weight(model, embedding_layer):
 
                 if old_output_embeddings is not None and new_output_embeddings is not None:
                     self.assertEqual(new_output_embeddings.shape[0], assert_size)
-                    self.assertEqual(
-                        new_output_embeddings.shape[1],
-                        old_output_embeddings.shape[1],
-                    )
+                    self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1])
 
                     models_equal = True
-                    for p1, p2 in zip(
-                        old_output_embeddings.value(),
-                        new_output_embeddings.value(),
-                    ):
+                    for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
                         if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
                             models_equal = False
                     self.assertTrue(models_equal)
 
     def test_lm_head_model_random_no_beam_search_generate(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
 
         # iterate over all generative models
@@ -1111,25 +965,16 @@ def test_lm_head_model_random_no_beam_search_generate(self):
 
             # check bad words tokens language generation
             # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [
-                self._generate_random_bad_tokens(1, model),
-                self._generate_random_bad_tokens(2, model),
-            ]
+            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
             output_tokens = model.generate(
-                input_ids,
-                do_sample=True,
-                bad_words_ids=bad_words_ids,
-                num_return_sequences=2,
+                input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2
             )
             # only count generated tokens
             generated_ids = output_tokens[:, input_ids.shape[-1] :]
             self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
     def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
         if input_ids is None:
             input_ids = inputs_dict.get("input_features", None)
@@ -1162,10 +1007,7 @@ def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
                 self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
 
     def test_lm_head_model_random_beam_search_generate(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
 
         for model_class in self.all_generative_model_classes:
@@ -1180,12 +1022,7 @@ def test_lm_head_model_random_beam_search_generate(self):
 
             with self.assertRaises(AssertionError):
                 # generating more sequences than having beams leads is not possible
-                model.generate(
-                    input_ids,
-                    do_sample=False,
-                    num_return_sequences=3,
-                    num_beams=2,
-                )
+                model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
 
             # num_return_sequences > 1, sample
             self._check_generated_ids(
@@ -1197,37 +1034,20 @@ def test_lm_head_model_random_beam_search_generate(self):
                 )
             )
             # num_return_sequences > 1, greedy
-            self._check_generated_ids(
-                model.generate(
-                    input_ids,
-                    do_sample=False,
-                    num_beams=2,
-                    num_return_sequences=2,
-                )
-            )
+            self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2))
 
             # check bad words tokens language generation
             # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [
-                self._generate_random_bad_tokens(1, model),
-                self._generate_random_bad_tokens(2, model),
-            ]
+            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
             output_tokens = model.generate(
-                input_ids,
-                do_sample=False,
-                bad_words_ids=bad_words_ids,
-                num_beams=2,
-                num_return_sequences=2,
+                input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2
             )
             # only count generated tokens
             generated_ids = output_tokens[:, input_ids.shape[-1] :]
             self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
 
     def test_lm_head_model_beam_search_generate_dict_outputs(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict.get("input_ids", None)
         if input_ids is None:
             input_ids = inputs_dict.get("input_features", None)
@@ -1262,20 +1082,14 @@ def test_lm_head_model_beam_search_generate_dict_outputs(self):
                 self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
 
     def test_loss_computation(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
             model = model_class(config)
             if getattr(model, "hf_compute_loss", None):
                 # The number of elements in the loss should be the same as the number of elements in the label
                 prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
                 added_label = prepared_for_class[
-                    sorted(
-                        list(prepared_for_class.keys() - inputs_dict.keys()),
-                        reverse=True,
-                    )[0]
+                    sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0]
                 ]
                 loss_size = tf.size(added_label)
 
@@ -1286,11 +1100,7 @@ def test_loss_computation(self):
 
                 # Test that model correctly compute the loss with kwargs
                 prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                possible_input_names = {
-                    "input_ids",
-                    "pixel_values",
-                    "input_features",
-                }
+                possible_input_names = {"input_ids", "pixel_values", "input_features"}
                 input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
                 model_input = prepared_for_class.pop(input_name)
 
@@ -1334,15 +1144,8 @@ def test_loss_computation(self):
                 self.assertEqual(loss.shape, [loss_size])
 
     def test_generate_with_headmasking(self):
-        attention_names = [
-            "encoder_attentions",
-            "decoder_attentions",
-            "cross_attentions",
-        ]
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_generative_model_classes:
             model = model_class(config)
@@ -1377,10 +1180,7 @@ def test_generate_with_headmasking(self):
     def test_load_with_mismatched_shapes(self):
         if not self.test_mismatched_shapes:
             return
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
@@ -1487,13 +1287,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
 def random_attention_mask(shape, rng=None, name=None, dtype=None):
     attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype)
     # make sure that at least one token is attended to for each batch
-    attn_mask = tf.concat(
-        [
-            tf.constant(value=1, shape=(shape[0], 1), dtype=dtype),
-            attn_mask[:, 1:],
-        ],
-        axis=1,
-    )
+    attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1)
     return attn_mask
 
 
@@ -1510,10 +1304,7 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
     for _ in range(total_dims):
         values.append(rng.random() * scale)
 
-    return tf.reshape(
-        tf.constant(values, dtype=dtype if dtype is not None else tf.float32),
-        shape=shape,
-    )
+    return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape)
 
 
 @require_tf
@@ -1592,34 +1383,12 @@ def test_top_k_top_p_filtering(self):
         )
 
         non_inf_expected_idx = tf.convert_to_tensor(
-            [
-                [0, 0],
-                [0, 9],
-                [0, 10],
-                [0, 25],
-                [0, 26],
-                [1, 13],
-                [1, 17],
-                [1, 18],
-                [1, 20],
-                [1, 27],
-            ],
+            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]],
             dtype=tf.int32,
         )  # expected non filtered idx as noted above
 
         non_inf_expected_output = tf.convert_to_tensor(
-            [
-                8.222099,
-                7.3534126,
-                8.432078,
-                7.4402075,
-                9.38451,
-                6.271159,
-                8.827531,
-                5.4402995,
-                7.3857956,
-                9.677023,
-            ],
+            [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023],
             dtype=tf.float32,
         )  # expected non filtered values as noted above
 
@@ -1650,31 +1419,19 @@ def tearDownClass(cls):
             pass
 
         try:
-            delete_repo(
-                token=cls._token,
-                name="test-model-tf-org",
-                organization="valid_org",
-            )
+            delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org")
         except HTTPError:
             pass
 
     def test_push_to_hub(self):
         config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
         model = TFBertModel(config)
         # Make sure model is properly initialized
         _ = model(model.dummy_inputs)
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(
-                os.path.join(tmp_dir, "test-model-tf"),
-                push_to_hub=True,
-                use_auth_token=self._token,
-            )
+            model.save_pretrained(os.path.join(tmp_dir, "test-model-tf"), push_to_hub=True, use_auth_token=self._token)
 
             new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
             models_equal = True
@@ -1685,11 +1442,7 @@ def test_push_to_hub(self):
 
     def test_push_to_hub_with_model_card(self):
         config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
         model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -1698,11 +1451,7 @@ def test_push_to_hub_with_model_card(self):
 
     def test_push_to_hub_in_organization(self):
         config = BertConfig(
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
         )
         model = TFBertModel(config)
         with tempfile.TemporaryDirectory() as tmp_dir:

From de00fb288a07d64a7e15b657a4d0a343a2d38602 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <Sylvain.gugger@gmail.com>
Date: Fri, 25 Feb 2022 11:36:25 -0500
Subject: [PATCH 65/65] Styling nits

---
 .../models/convnext/modeling_tf_convnext.py   | 15 ++-----
 tests/convnext/test_modeling_tf_convnext.py   | 45 +++----------------
 2 files changed, 10 insertions(+), 50 deletions(-)

diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index c6e7f1311e532..fbb436059340f 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -141,15 +141,9 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs):
         # Using `layers.Activation` instead of `tf.identity` to better control `training`
         # behaviour.
         self.drop_path = (
-            TFConvNextDropPath(
-                drop_path,
-                name="drop_path",
-            )
+            TFConvNextDropPath(drop_path, name="drop_path")
             if drop_path > 0.0
-            else tf.keras.layers.Activation(
-                "linear",
-                name="drop_path",
-            )
+            else tf.keras.layers.Activation("linear", name="drop_path")
         )
 
     def build(self, input_shape: tf.TensorShape):
@@ -275,10 +269,7 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True):
         if not return_dict:
             return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
 
-        return TFBaseModelOutput(
-            last_hidden_state=hidden_states,
-            hidden_states=all_hidden_states,
-        )
+        return TFBaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
 
 
 @keras_serializable
diff --git a/tests/convnext/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py
index f5ccd1438af03..880e006f1abf2 100644
--- a/tests/convnext/test_modeling_tf_convnext.py
+++ b/tests/convnext/test_modeling_tf_convnext.py
@@ -73,14 +73,7 @@ def __init__(
         self.scope = scope
 
     def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor(
-            [
-                self.batch_size,
-                self.num_channels,
-                self.image_size,
-                self.image_size,
-            ]
-        )
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
 
         labels = None
         if self.use_labels:
@@ -107,22 +100,14 @@ def create_and_check_model(self, config, pixel_values, labels):
         # expected last hidden states: B, C, H // 32, W // 32
         self.parent.assertEqual(
             result.last_hidden_state.shape,
-            (
-                self.batch_size,
-                self.hidden_sizes[-1],
-                self.image_size // 32,
-                self.image_size // 32,
-            ),
+            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
         )
 
     def create_and_check_for_image_classification(self, config, pixel_values, labels):
         config.num_labels = self.type_sequence_label_size
         model = TFConvNextForImageClassification(config)
         result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(
-            result.logits.shape,
-            (self.batch_size, self.type_sequence_label_size),
-        )
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
 
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
@@ -138,14 +123,7 @@ class TFConvNextModelTest(TFModelTesterMixin, unittest.TestCase):
     attention_mask and seq_length.
     """
 
-    all_model_classes = (
-        (
-            TFConvNextModel,
-            TFConvNextForImageClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
+    all_model_classes = (TFConvNextModel, TFConvNextForImageClassification) if is_tf_available() else ()
 
     test_pruning = False
     test_onnx = False
@@ -202,16 +180,10 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             # ConvNext's feature maps are of shape (batch_size, num_channels, height, width)
             self.assertListEqual(
                 list(hidden_states[0].shape[-2:]),
-                [
-                    self.model_tester.image_size // 4,
-                    self.model_tester.image_size // 4,
-                ],
+                [self.model_tester.image_size // 4, self.model_tester.image_size // 4],
             )
 
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             inputs_dict["output_hidden_states"] = True
@@ -225,10 +197,7 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
     # Since ConvNext does not have any attention we need to rewrite this test.
     def test_model_outputs_equivalence(self):
-        (
-            config,
-            inputs_dict,
-        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
             tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)