diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index ed04cad3dd9bf0..82053b11effdda 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -259,7 +259,7 @@ Flax), PyTorch, and/or TensorFlow.
| mBART | ✅ | ✅ | ✅ | ✅ | ✅ |
| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
-| MobileViT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| MobileViT | ❌ | ❌ | ✅ | ✅ | ❌ |
| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ |
| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ |
| MVP | ✅ | ✅ | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/model_doc/mobilevit.mdx b/docs/source/en/model_doc/mobilevit.mdx
index f5fd403fd59eda..5725bd5ce5835f 100644
--- a/docs/source/en/model_doc/mobilevit.mdx
+++ b/docs/source/en/model_doc/mobilevit.mdx
@@ -22,12 +22,40 @@ The abstract from the paper is the following:
Tips:
-- MobileViT is more like a CNN than a Transformer model. It does not work on sequence data but on batches of images. Unlike ViT, there are no embeddings. The backbone model outputs a feature map.
+- MobileViT is more like a CNN than a Transformer model. It does not work on sequence data but on batches of images. Unlike ViT, there are no embeddings. The backbone model outputs a feature map. You can follow [this tutorial](https://keras.io/examples/vision/mobilevit) for a lightweight introduction.
- One can use [`MobileViTFeatureExtractor`] to prepare images for the model. Note that if you do your own preprocessing, the pretrained checkpoints expect images to be in BGR pixel order (not RGB).
- The available image classification checkpoints are pre-trained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k) (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes).
- The segmentation model uses a [DeepLabV3](https://arxiv.org/abs/1706.05587) head. The available semantic segmentation checkpoints are pre-trained on [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/).
+- As the name suggests, MobileViT was designed to be performant and efficient on mobile phones. The TensorFlow versions of the MobileViT models are fully compatible with [TensorFlow Lite](https://www.tensorflow.org/lite).
-This model was contributed by [matthijs](https://huggingface.co/Matthijs). The original code and weights can be found [here](https://github.com/apple/ml-cvnets).
+  You can use the following code to convert a MobileViT checkpoint (be it image classification or semantic segmentation) into a
+  TensorFlow Lite model:
+
+```py
+from transformers import TFMobileViTForImageClassification
+import tensorflow as tf
+
+
+model_ckpt = "apple/mobilevit-xx-small"
+model = TFMobileViTForImageClassification.from_pretrained(model_ckpt)
+
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [
+    tf.lite.OpsSet.TFLITE_BUILTINS,
+    tf.lite.OpsSet.SELECT_TF_OPS,
+]
+tflite_model = converter.convert()
+tflite_filename = model_ckpt.split("/")[-1] + ".tflite"
+with open(tflite_filename, "wb") as f:
+    f.write(tflite_model)
+```
+
+  The resulting model will be just **about an MB** in size, making it a good fit for mobile applications where resources and network
+  bandwidth can be constrained.
+
+
+This model was contributed by [matthijs](https://huggingface.co/Matthijs). The TensorFlow version of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code and weights can be found [here](https://github.com/apple/ml-cvnets).
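+  A minimal TensorFlow classification example, in the same spirit as the semantic segmentation example included in this patch
+  (the checkpoint and image URL mirror the ones used elsewhere in these docs; treat it as a sketch rather than a reference
+  implementation):
+
+```py
+from transformers import MobileViTFeatureExtractor, TFMobileViTForImageClassification
+from PIL import Image
+import requests
+import tensorflow as tf
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-small")
+model = TFMobileViTForImageClassification.from_pretrained("apple/mobilevit-small")
+
+# the feature extractor handles resizing and the BGR channel order mentioned in the tips above
+inputs = feature_extractor(images=image, return_tensors="tf")
+outputs = model(**inputs)
+
+# one logit per ImageNet-1k class
+predicted_class_idx = int(tf.math.argmax(outputs.logits, axis=-1)[0])
+print(model.config.id2label[predicted_class_idx])
+```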
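+  Once converted, the exported file can be exercised with the stock TensorFlow Lite `Interpreter`. The sketch below assumes the
+  `mobilevit-xx-small.tflite` file produced by the conversion snippet above and feeds a random tensor of whatever shape the export
+  reports, so the input here is purely illustrative:
+
+```py
+import numpy as np
+import tensorflow as tf
+
+interpreter = tf.lite.Interpreter(model_path="mobilevit-xx-small.tflite")
+interpreter.allocate_tensors()
+
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+
+# build a dummy input with the exact shape and dtype the exported model expects
+dummy_input = np.random.uniform(size=input_details[0]["shape"]).astype(input_details[0]["dtype"])
+
+interpreter.set_tensor(input_details[0]["index"], dummy_input)
+interpreter.invoke()
+
+logits = interpreter.get_tensor(output_details[0]["index"])
+print(logits.shape)
+```
+
+  Because the conversion enables `SELECT_TF_OPS`, deployments outside the full TensorFlow Python package need the TensorFlow Lite
+  Flex delegate linked in; the Python `Interpreter` bundled with TensorFlow handles this automatically.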
## MobileViTConfig
@@ -53,3 +81,18 @@ This model was contributed by [matthijs](https://huggingface.co/Matthijs). The o
[[autodoc]] MobileViTForSemanticSegmentation
    - forward
+
+## TFMobileViTModel
+
+[[autodoc]] TFMobileViTModel
+    - call
+
+## TFMobileViTForImageClassification
+
+[[autodoc]] TFMobileViTForImageClassification
+    - call
+
+## TFMobileViTForSemanticSegmentation
+
+[[autodoc]] TFMobileViTForSemanticSegmentation
+    - call
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index aff905b97ec5d7..8be2f73b9e7a55 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -2398,6 +2398,15 @@
            "TFMobileBertPreTrainedModel",
        ]
    )
+    _import_structure["models.mobilevit"].extend(
+        [
+            "TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TFMobileViTPreTrainedModel",
+            "TFMobileViTModel",
+            "TFMobileViTForImageClassification",
+            "TFMobileViTForSemanticSegmentation",
+        ]
+    )
    _import_structure["models.mpnet"].extend(
        [
            "TF_MPNET_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -4847,6 +4856,7 @@
    from .models.mbart import TFMBartForConditionalGeneration, TFMBartModel, TFMBartPreTrainedModel
    from .models.mobilebert import (
        TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
        TFMobileBertForMaskedLM,
        TFMobileBertForMultipleChoice,
        TFMobileBertForNextSentencePrediction,
@@ -4857,6 +4867,10 @@
        TFMobileBertMainLayer,
        TFMobileBertModel,
        TFMobileBertPreTrainedModel,
    )
+    from .models.mobilevit import (
+        TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        TFMobileViTForImageClassification,
+        TFMobileViTForSemanticSegmentation,
+        TFMobileViTModel,
+        TFMobileViTPreTrainedModel,
+    )
    from .models.mpnet import (
        TF_MPNET_PRETRAINED_MODEL_ARCHIVE_LIST,
diff --git a/src/transformers/modeling_tf_outputs.py b/src/transformers/modeling_tf_outputs.py
index a1d3df074fe780..efb2412084a756 100644
--- a/src/transformers/modeling_tf_outputs.py
+++ b/src/transformers/modeling_tf_outputs.py
@@ -685,6 +685,37 @@ class TFSemanticSegmenterOutput(ModelOutput):
    attentions: Optional[Tuple[tf.Tensor]] = None
+
+@dataclass
+class TFSemanticSegmenterOutputWithNoAttention(ModelOutput):
+    """
+    Base class for outputs of semantic segmentation models that do not output attention scores.
+
+    Args:
+        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Classification (or regression if config.num_labels==1) loss.
+        logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
+            Classification scores for each pixel.
+
+            <Tip warning={true}>
+
+            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
+            to avoid doing two interpolations and losing some quality when a user needs to resize the logits to the
+            original image size as post-processing. You should always check your logits shape and resize as needed.
+
+            </Tip>
+
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
+            the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + + @dataclass class TFImageClassifierOutput(ModelOutput): """ diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 991bb79a6b3719..a12f6accdcaeee 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -59,6 +59,7 @@ ("marian", "TFMarianModel"), ("mbart", "TFMBartModel"), ("mobilebert", "TFMobileBertModel"), + ("mobilevit", "TFMobileViTModel"), ("mpnet", "TFMPNetModel"), ("mt5", "TFMT5Model"), ("openai-gpt", "TFOpenAIGPTModel"), @@ -182,6 +183,7 @@ ("convnext", "TFConvNextForImageClassification"), ("data2vec-vision", "TFData2VecVisionForImageClassification"), ("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")), + ("mobilevit", "TFMobileViTForImageClassification"), ("regnet", "TFRegNetForImageClassification"), ("resnet", "TFResNetForImageClassification"), ("segformer", "TFSegformerForImageClassification"), @@ -194,6 +196,7 @@ [ # Model for Semantic Segmentation mapping ("data2vec-vision", "TFData2VecVisionForSemanticSegmentation"), + ("mobilevit", "TFMobileViTForSemanticSegmentation"), ("segformer", "TFSegformerForSemanticSegmentation"), ] ) diff --git a/src/transformers/models/mobilevit/__init__.py b/src/transformers/models/mobilevit/__init__.py index cd639f50323c4f..e1e088f693ba24 100644 --- a/src/transformers/models/mobilevit/__init__.py +++ b/src/transformers/models/mobilevit/__init__.py @@ -17,7 +17,13 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_tf_available, + is_torch_available, + is_vision_available, +) _import_structure = { @@ -46,6 +52,19 @@ "MobileViTPreTrainedModel", ] +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_mobilevit"] = [ + "TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFMobileViTForImageClassification", + "TFMobileViTForSemanticSegmentation", + "TFMobileViTModel", + "TFMobileViTPreTrainedModel", + ] if TYPE_CHECKING: from .configuration_mobilevit import MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileViTConfig, MobileViTOnnxConfig @@ -72,6 +91,28 @@ MobileViTPreTrainedModel, ) + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_mobilevit import MobileViTFeatureExtractor + + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_mobilevit import ( + TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFMobileViTForImageClassification, + TFMobileViTForSemanticSegmentation, + TFMobileViTModel, + TFMobileViTPreTrainedModel, + ) + else: import sys diff --git a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py new file mode 100644 index 00000000000000..c54bd6554dc35d --- /dev/null +++ b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py @@ -0,0 +1,1152 @@ +# coding=utf-8 +# Copyright 2022 Apple Inc. and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Original license: https://github.com/apple/ml-cvnets/blob/main/LICENSE +""" TensorFlow 2.0 MobileViT model.""" + +from typing import Dict, Optional, Tuple, Union + +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPooling, + TFImageClassifierOutputWithNoAttention, + TFSemanticSegmenterOutputWithNoAttention, +) +from ...modeling_tf_utils import TFPreTrainedModel, TFSequenceClassificationLoss, keras_serializable, unpack_inputs +from ...tf_utils import shape_list, stable_softmax +from ...utils import logging +from .configuration_mobilevit import MobileViTConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "MobileViTConfig" +_FEAT_EXTRACTOR_FOR_DOC = "MobileViTFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "apple/mobilevit-small" +_EXPECTED_OUTPUT_SHAPE = [1, 640, 8, 8] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "apple/mobilevit-small" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + + +TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "apple/mobilevit-small", + "apple/mobilevit-x-small", + "apple/mobilevit-xx-small", + "apple/deeplabv3-mobilevit-small", + "apple/deeplabv3-mobilevit-x-small", + "apple/deeplabv3-mobilevit-xx-small", + # See all MobileViT models at https://huggingface.co/models?filter=mobilevit +] + + +def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None) -> int: + """ + Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the + original TensorFlow repo. It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_value < 0.9 * value: + new_value += divisor + return int(new_value) + + +class TFMobileViTConvLayer(tf.keras.layers.Layer): + def __init__( + self, + config: MobileViTConfig, + out_channels: int, + kernel_size: int, + stride: int = 1, + groups: int = 1, + bias: bool = False, + dilation: int = 1, + use_normalization: bool = True, + use_activation: Union[bool, str] = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + logger.warning( + f"\n{self.__class__.__name__} has backpropagation operations that are NOT supported on CPU. 
If you wish " + "to train/fine-tine this model, you need a GPU or a TPU" + ) + + padding = int((kernel_size - 1) / 2) * dilation + self.padding = tf.keras.layers.ZeroPadding2D(padding) + + if out_channels % groups != 0: + raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.") + + self.convolution = tf.keras.layers.Conv2D( + filters=out_channels, + kernel_size=kernel_size, + strides=stride, + padding="VALID", + dilation_rate=dilation, + groups=groups, + use_bias=bias, + name="convolution", + ) + + if use_normalization: + self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="normalization") + else: + self.normalization = None + + if use_activation: + if isinstance(use_activation, str): + self.activation = get_tf_activation(use_activation) + elif isinstance(config.hidden_act, str): + self.activation = get_tf_activation(config.hidden_act) + else: + self.activation = config.hidden_act + else: + self.activation = None + + def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: + padded_features = self.padding(features) + features = self.convolution(padded_features) + if self.normalization is not None: + features = self.normalization(features, training=training) + if self.activation is not None: + features = self.activation(features) + return features + + +class TFMobileViTInvertedResidual(tf.keras.layers.Layer): + """ + Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381 + """ + + def __init__( + self, config: MobileViTConfig, in_channels: int, out_channels: int, stride: int, dilation: int = 1, **kwargs + ) -> None: + super().__init__(**kwargs) + expanded_channels = make_divisible(int(round(in_channels * config.expand_ratio)), 8) + + if stride not in [1, 2]: + raise ValueError(f"Invalid stride {stride}.") + + self.use_residual = (stride == 1) and (in_channels == out_channels) + + self.expand_1x1 = TFMobileViTConvLayer( + config, out_channels=expanded_channels, kernel_size=1, name="expand_1x1" + ) + + self.conv_3x3 = TFMobileViTConvLayer( + config, + out_channels=expanded_channels, + kernel_size=3, + stride=stride, + groups=expanded_channels, + dilation=dilation, + name="conv_3x3", + ) + + self.reduce_1x1 = TFMobileViTConvLayer( + config, + out_channels=out_channels, + kernel_size=1, + use_activation=False, + name="reduce_1x1", + ) + + def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: + residual = features + + features = self.expand_1x1(features, training=training) + features = self.conv_3x3(features, training=training) + features = self.reduce_1x1(features, training=training) + + return residual + features if self.use_residual else features + + +class TFMobileViTMobileNetLayer(tf.keras.layers.Layer): + def __init__( + self, + config: MobileViTConfig, + in_channels: int, + out_channels: int, + stride: int = 1, + num_stages: int = 1, + **kwargs + ) -> None: + super().__init__(**kwargs) + + self.layers = [] + for i in range(num_stages): + layer = TFMobileViTInvertedResidual( + config, + in_channels=in_channels, + out_channels=out_channels, + stride=stride if i == 0 else 1, + name=f"layer.{i}", + ) + self.layers.append(layer) + in_channels = out_channels + + def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: + for layer_module in self.layers: + features = layer_module(features, training=training) + return features + + +class TFMobileViTSelfAttention(tf.keras.layers.Layer): + def __init__(self, config: MobileViTConfig, hidden_size: int, 
**kwargs) -> None: + super().__init__(**kwargs) + + if hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size {hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + scale = tf.cast(self.attention_head_size, dtype=tf.float32) + self.scale = tf.math.sqrt(scale) + + self.query = tf.keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="query") + self.key = tf.keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="key") + self.value = tf.keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="value") + + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor: + batch_size = tf.shape(x)[0] + x = tf.reshape(x, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + batch_size = tf.shape(hidden_states)[0] + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = attention_scores / self.scale + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs, training=training) + + context_layer = tf.matmul(attention_probs, value_layer) + + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + context_layer = tf.reshape(context_layer, shape=(batch_size, -1, self.all_head_size)) + return context_layer + + +class TFMobileViTSelfOutput(tf.keras.layers.Layer): + def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None: + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense(hidden_size, name="dense") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + return hidden_states + + +class TFMobileViTAttention(tf.keras.layers.Layer): + def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None: + super().__init__(**kwargs) + self.attention = TFMobileViTSelfAttention(config, hidden_size, name="attention") + self.dense_output = TFMobileViTSelfOutput(config, hidden_size, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + self_outputs = self.attention(hidden_states, training=training) + attention_output = self.dense_output(self_outputs, training=training) + return attention_output + + +class TFMobileViTIntermediate(tf.keras.layers.Layer): + def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None: + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense(intermediate_size, name="dense") + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class TFMobileViTOutput(tf.keras.layers.Layer): + def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None: + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense(hidden_size, name="dense") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = hidden_states + input_tensor + return hidden_states + + +class TFMobileViTTransformerLayer(tf.keras.layers.Layer): + def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None: + super().__init__(**kwargs) + self.attention = TFMobileViTAttention(config, hidden_size, name="attention") + self.intermediate = TFMobileViTIntermediate(config, hidden_size, intermediate_size, name="intermediate") + self.mobilevit_output = TFMobileViTOutput(config, hidden_size, intermediate_size, name="output") + self.layernorm_before = tf.keras.layers.LayerNormalization( + epsilon=config.layer_norm_eps, name="layernorm_before" + ) + self.layernorm_after = tf.keras.layers.LayerNormalization( + epsilon=config.layer_norm_eps, name="layernorm_after" + ) + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + attention_output = 
self.attention(self.layernorm_before(hidden_states), training=training) + hidden_states = attention_output + hidden_states + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = self.mobilevit_output(layer_output, hidden_states, training=training) + return layer_output + + +class TFMobileViTTransformer(tf.keras.layers.Layer): + def __init__(self, config: MobileViTConfig, hidden_size: int, num_stages: int, **kwargs) -> None: + super().__init__(**kwargs) + + self.layers = [] + for i in range(num_stages): + transformer_layer = TFMobileViTTransformerLayer( + config, + hidden_size=hidden_size, + intermediate_size=int(hidden_size * config.mlp_ratio), + name=f"layer.{i}", + ) + self.layers.append(transformer_layer) + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + for layer_module in self.layers: + hidden_states = layer_module(hidden_states, training=training) + return hidden_states + + +class TFMobileViTLayer(tf.keras.layers.Layer): + """ + MobileViT block: https://arxiv.org/abs/2110.02178 + """ + + def __init__( + self, + config: MobileViTConfig, + in_channels: int, + out_channels: int, + stride: int, + hidden_size: int, + num_stages: int, + dilation: int = 1, + **kwargs + ) -> None: + super().__init__(**kwargs) + self.patch_width = config.patch_size + self.patch_height = config.patch_size + + if stride == 2: + self.downsampling_layer = TFMobileViTInvertedResidual( + config, + in_channels=in_channels, + out_channels=out_channels, + stride=stride if dilation == 1 else 1, + dilation=dilation // 2 if dilation > 1 else 1, + name="downsampling_layer", + ) + in_channels = out_channels + else: + self.downsampling_layer = None + + self.conv_kxk = TFMobileViTConvLayer( + config, out_channels=in_channels, kernel_size=config.conv_kernel_size, name="conv_kxk" + ) + + self.conv_1x1 = TFMobileViTConvLayer( + config, + out_channels=hidden_size, + kernel_size=1, + use_normalization=False, + use_activation=False, + name="conv_1x1", + ) + + self.transformer = TFMobileViTTransformer( + config, hidden_size=hidden_size, num_stages=num_stages, name="transformer" + ) + + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + + self.conv_projection = TFMobileViTConvLayer( + config, out_channels=in_channels, kernel_size=1, name="conv_projection" + ) + + self.fusion = TFMobileViTConvLayer( + config, out_channels=in_channels, kernel_size=config.conv_kernel_size, name="fusion" + ) + + def unfolding(self, features: tf.Tensor) -> Tuple[tf.Tensor, Dict]: + patch_width, patch_height = self.patch_width, self.patch_height + patch_area = tf.cast(patch_width * patch_height, "int32") + + batch_size = tf.shape(features)[0] + orig_height = tf.shape(features)[1] + orig_width = tf.shape(features)[2] + channels = tf.shape(features)[3] + + new_height = tf.cast(tf.math.ceil(orig_height / patch_height) * patch_height, "int32") + new_width = tf.cast(tf.math.ceil(orig_width / patch_width) * patch_width, "int32") + + interpolate = new_width != orig_width or new_height != orig_height + if interpolate: + # Note: Padding can be done, but then it needs to be handled in attention function. 
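+            # bilinear resizing keeps the spatial dims exact multiples of the patch size, so the
+            # reshapes below split the feature map into whole patches without any attention masking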
+ features = tf.image.resize(features, size=(new_height, new_width), method="bilinear") + + # number of patches along width and height + num_patch_width = new_width // patch_width + num_patch_height = new_height // patch_height + num_patches = num_patch_height * num_patch_width + + # convert from shape (batch_size, orig_height, orig_width, channels) + # to the shape (batch_size * patch_area, num_patches, channels) + features = tf.transpose(features, [0, 3, 1, 2]) + patches = tf.reshape( + features, (batch_size * channels * num_patch_height, patch_height, num_patch_width, patch_width) + ) + patches = tf.transpose(patches, [0, 2, 1, 3]) + patches = tf.reshape(patches, (batch_size, channels, num_patches, patch_area)) + patches = tf.transpose(patches, [0, 3, 2, 1]) + patches = tf.reshape(patches, (batch_size * patch_area, num_patches, channels)) + + info_dict = { + "orig_size": (orig_height, orig_width), + "batch_size": batch_size, + "channels": channels, + "interpolate": interpolate, + "num_patches": num_patches, + "num_patches_width": num_patch_width, + "num_patches_height": num_patch_height, + } + return patches, info_dict + + def folding(self, patches: tf.Tensor, info_dict: Dict) -> tf.Tensor: + patch_width, patch_height = self.patch_width, self.patch_height + patch_area = int(patch_width * patch_height) + + batch_size = info_dict["batch_size"] + channels = info_dict["channels"] + num_patches = info_dict["num_patches"] + num_patch_height = info_dict["num_patches_height"] + num_patch_width = info_dict["num_patches_width"] + + # convert from shape (batch_size * patch_area, num_patches, channels) + # back to shape (batch_size, channels, orig_height, orig_width) + features = tf.reshape(patches, (batch_size, patch_area, num_patches, -1)) + features = tf.transpose(features, perm=(0, 3, 2, 1)) + features = tf.reshape( + features, (batch_size * channels * num_patch_height, num_patch_width, patch_height, patch_width) + ) + features = tf.transpose(features, perm=(0, 2, 1, 3)) + features = tf.reshape( + features, (batch_size, channels, num_patch_height * patch_height, num_patch_width * patch_width) + ) + features = tf.transpose(features, perm=(0, 2, 3, 1)) + + if info_dict["interpolate"]: + features = tf.image.resize(features, size=info_dict["orig_size"], method="bilinear") + + return features + + def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: + # reduce spatial dimensions if needed + if self.downsampling_layer: + features = self.downsampling_layer(features, training=training) + + residual = features + + # local representation + features = self.conv_kxk(features, training=training) + features = self.conv_1x1(features, training=training) + + # convert feature map to patches + patches, info_dict = self.unfolding(features) + + # learn global representations + patches = self.transformer(patches, training=training) + patches = self.layernorm(patches) + + # convert patches back to feature maps + features = self.folding(patches, info_dict) + + features = self.conv_projection(features, training=training) + features = self.fusion(tf.concat([residual, features], axis=-1), training=training) + return features + + +class TFMobileViTEncoder(tf.keras.layers.Layer): + def __init__(self, config: MobileViTConfig, **kwargs) -> None: + super().__init__(**kwargs) + self.config = config + + self.layers = [] + + # segmentation architectures like DeepLab and PSPNet modify the strides + # of the classification backbones + dilate_layer_4 = dilate_layer_5 = False + if config.output_stride == 8: + 
dilate_layer_4 = True + dilate_layer_5 = True + elif config.output_stride == 16: + dilate_layer_5 = True + + dilation = 1 + + layer_1 = TFMobileViTMobileNetLayer( + config, + in_channels=config.neck_hidden_sizes[0], + out_channels=config.neck_hidden_sizes[1], + stride=1, + num_stages=1, + name="layer.0", + ) + self.layers.append(layer_1) + + layer_2 = TFMobileViTMobileNetLayer( + config, + in_channels=config.neck_hidden_sizes[1], + out_channels=config.neck_hidden_sizes[2], + stride=2, + num_stages=3, + name="layer.1", + ) + self.layers.append(layer_2) + + layer_3 = TFMobileViTLayer( + config, + in_channels=config.neck_hidden_sizes[2], + out_channels=config.neck_hidden_sizes[3], + stride=2, + hidden_size=config.hidden_sizes[0], + num_stages=2, + name="layer.2", + ) + self.layers.append(layer_3) + + if dilate_layer_4: + dilation *= 2 + + layer_4 = TFMobileViTLayer( + config, + in_channels=config.neck_hidden_sizes[3], + out_channels=config.neck_hidden_sizes[4], + stride=2, + hidden_size=config.hidden_sizes[1], + num_stages=4, + dilation=dilation, + name="layer.3", + ) + self.layers.append(layer_4) + + if dilate_layer_5: + dilation *= 2 + + layer_5 = TFMobileViTLayer( + config, + in_channels=config.neck_hidden_sizes[4], + out_channels=config.neck_hidden_sizes[5], + stride=2, + hidden_size=config.hidden_sizes[2], + num_stages=3, + dilation=dilation, + name="layer.4", + ) + self.layers.append(layer_5) + + def call( + self, + hidden_states: tf.Tensor, + output_hidden_states: bool = False, + return_dict: bool = True, + training: bool = False, + ) -> Union[tuple, TFBaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + + for i, layer_module in enumerate(self.layers): + hidden_states = layer_module(hidden_states, training=training) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) + + return TFBaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states) + + +@keras_serializable +class TFMobileViTMainLayer(tf.keras.layers.Layer): + config_class = MobileViTConfig + + def __init__(self, config: MobileViTConfig, expand_output: bool = True, **kwargs): + super().__init__(**kwargs) + self.config = config + self.expand_output = expand_output + + self.conv_stem = TFMobileViTConvLayer( + config, + out_channels=config.neck_hidden_sizes[0], + kernel_size=3, + stride=2, + name="conv_stem", + ) + + self.encoder = TFMobileViTEncoder(config, name="encoder") + + if self.expand_output: + self.conv_1x1_exp = TFMobileViTConvLayer( + config, out_channels=config.neck_hidden_sizes[6], kernel_size=1, name="conv_1x1_exp" + ) + + self.pooler = tf.keras.layers.GlobalAveragePooling2D(data_format="channels_first", name="pooler") + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + @unpack_inputs + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPooling]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # So change the input format from `NCHW` to `NHWC`. + # shape = (batch_size, in_height, in_width, in_channels=num_channels) + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + + embedding_output = self.conv_stem(pixel_values, training=training) + + encoder_outputs = self.encoder( + embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict, training=training + ) + + if self.expand_output: + last_hidden_state = self.conv_1x1_exp(encoder_outputs[0]) + + # Change to NCHW output format to have uniformity in the modules + last_hidden_state = tf.transpose(last_hidden_state, perm=[0, 3, 1, 2]) + + # global average pooling: (batch_size, channels, height, width) -> (batch_size, channels) + pooled_output = self.pooler(last_hidden_state) + else: + last_hidden_state = encoder_outputs[0] + # Change to NCHW output format to have uniformity in the modules + last_hidden_state = tf.transpose(last_hidden_state, perm=[0, 3, 1, 2]) + pooled_output = None + + if not return_dict: + output = (last_hidden_state, pooled_output) if pooled_output is not None else (last_hidden_state,) + + # Change to NCHW output format to have uniformity in the modules + if not self.expand_output: + remaining_encoder_outputs = encoder_outputs[1:] + remaining_encoder_outputs = tuple( + [tf.transpose(h, perm=(0, 3, 1, 2)) for h in remaining_encoder_outputs[0]] + ) + remaining_encoder_outputs = (remaining_encoder_outputs,) + return output + remaining_encoder_outputs + else: + return output + encoder_outputs[1:] + + # Change the other hidden state outputs to NCHW as well + if output_hidden_states: + hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]]) + + return TFBaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, + ) + + +class TFMobileViTPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MobileViTConfig + base_model_prefix = "mobilevit" + main_input_name = "pixel_values" + + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. + + Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. + """ + VISION_DUMMY_INPUTS = tf.random.uniform( + shape=(3, self.config.num_channels, self.config.image_size, self.config.image_size), + dtype=tf.float32, + ) + return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} + + @tf.function( + input_signature=[ + { + "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), + } + ] + ) + def serving(self, inputs): + """ + Method used for serving the model. 
+ + Args: + inputs (`Dict[str, tf.Tensor]`): + The input of the saved model as a dictionary of tensors. + """ + output = self.call(inputs) + return self.serving_output(output) + + +MOBILEVIT_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the + tensors in the first argument of the model call function: `model(inputs)`. + + + + Parameters: + config ([`MobileViTConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +MOBILEVIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`MobileViTFeatureExtractor`]. See + [`MobileViTFeatureExtractor.__call__`] for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. 
+""" + + +@add_start_docstrings( + "The bare MobileViT model outputting raw hidden-states without any specific head on top.", + MOBILEVIT_START_DOCSTRING, +) +class TFMobileViTModel(TFMobileViTPreTrainedModel): + def __init__(self, config: MobileViTConfig, expand_output: bool = True, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.config = config + self.expand_output = expand_output + + self.mobilevit = TFMobileViTMainLayer(config, expand_output=expand_output, name="mobilevit") + + @unpack_inputs + @add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPooling]: + + output = self.mobilevit(pixel_values, output_hidden_states, return_dict, training=training) + return output + + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + # hidden_states not converted to Tensor with tf.convert_to_tensor as they are all of different dimensions + return TFBaseModelOutputWithPooling( + last_hidden_state=output.last_hidden_state, + pooler_output=output.pooler_output, + hidden_states=output.hidden_states, + ) + + +@add_start_docstrings( + """ + MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + MOBILEVIT_START_DOCSTRING, +) +class TFMobileViTForImageClassification(TFMobileViTPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: MobileViTConfig, *inputs, **kwargs) -> None: + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.mobilevit = TFMobileViTMainLayer(config, name="mobilevit") + + # Classifier head + self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) + self.classifier = ( + tf.keras.layers.Dense(config.num_labels, name="classifier") if config.num_labels > 0 else tf.identity + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFImageClassifierOutputWithNoAttention, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[tf.Tensor] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[tuple, TFImageClassifierOutputWithNoAttention]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mobilevit( + pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict, training=training + ) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(self.dropout(pooled_output, training=training)) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + + def serving_output(self, output: TFImageClassifierOutputWithNoAttention) -> TFImageClassifierOutputWithNoAttention: + # hidden_states and attention not converted to Tensor with tf.convert_to_tensor as they are all of different dimensions + return TFImageClassifierOutputWithNoAttention(logits=output.logits, hidden_states=output.hidden_states) + + +class TFMobileViTASPPPooling(tf.keras.layers.Layer): + def __init__(self, config: MobileViTConfig, out_channels: int, **kwargs) -> None: + super().__init__(**kwargs) + + self.global_pool = tf.keras.layers.GlobalAveragePooling2D(keepdims=True, name="global_pool") + + self.conv_1x1 = TFMobileViTConvLayer( + config, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_normalization=True, + use_activation="relu", + name="conv_1x1", + ) + + def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: + spatial_size = shape_list(features)[1:-1] + features = self.global_pool(features) + features = self.conv_1x1(features, training=training) + features = tf.image.resize(features, size=spatial_size, method="bilinear") + return features + + +class TFMobileViTASPP(tf.keras.layers.Layer): + """ + ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587 + """ + + def __init__(self, config: MobileViTConfig, **kwargs) -> None: + super().__init__(**kwargs) + + out_channels = config.aspp_out_channels + + if len(config.atrous_rates) != 3: + raise ValueError("Expected 3 values for atrous_rates") + + self.convs = [] + + in_projection = TFMobileViTConvLayer( + config, + out_channels=out_channels, + kernel_size=1, + use_activation="relu", + name="convs.0", + ) + self.convs.append(in_projection) + + self.convs.extend( + [ + TFMobileViTConvLayer( + config, + out_channels=out_channels, + kernel_size=3, + dilation=rate, + use_activation="relu", + name=f"convs.{i + 1}", + ) + for i, rate in enumerate(config.atrous_rates) + ] + ) + + pool_layer = TFMobileViTASPPPooling(config, out_channels, name=f"convs.{len(config.atrous_rates) + 1}") + self.convs.append(pool_layer) + + self.project = TFMobileViTConvLayer( + config, + out_channels=out_channels, + kernel_size=1, + use_activation="relu", + name="project", + ) + + self.dropout = tf.keras.layers.Dropout(config.aspp_dropout_prob) + + def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: + # since the hidden states were transposed to have `(batch_size, channels, height, width)` + # layout we transpose them back to have `(batch_size, height, width, channels)` layout. 
+ features = tf.transpose(features, perm=[0, 2, 3, 1]) + pyramid = [] + for conv in self.convs: + pyramid.append(conv(features, training=training)) + pyramid = tf.concat(pyramid, axis=-1) + + pooled_features = self.project(pyramid, training=training) + pooled_features = self.dropout(pooled_features, training=training) + return pooled_features + + +class TFMobileViTDeepLabV3(tf.keras.layers.Layer): + """ + DeepLabv3 architecture: https://arxiv.org/abs/1706.05587 + """ + + def __init__(self, config: MobileViTConfig, **kwargs) -> None: + super().__init__(**kwargs) + self.aspp = TFMobileViTASPP(config, name="aspp") + + self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) + + self.classifier = TFMobileViTConvLayer( + config, + out_channels=config.num_labels, + kernel_size=1, + use_normalization=False, + use_activation=False, + bias=True, + name="classifier", + ) + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + features = self.aspp(hidden_states[-1], training=training) + features = self.dropout(features, training=training) + features = self.classifier(features, training=training) + return features + + +@add_start_docstrings( + """ + MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC. + """, + MOBILEVIT_START_DOCSTRING, +) +class TFMobileViTForSemanticSegmentation(TFMobileViTPreTrainedModel): + def __init__(self, config: MobileViTConfig, **kwargs) -> None: + super().__init__(config, **kwargs) + + self.num_labels = config.num_labels + self.mobilevit = TFMobileViTMainLayer(config, expand_output=False, name="mobilevit") + self.segmentation_head = TFMobileViTDeepLabV3(config, name="segmentation_head") + + def hf_compute_loss(self, logits, labels): + # upsample logits to the images' original size + # `labels` is of shape (batch_size, height, width) + label_interp_shape = shape_list(labels)[1:] + + upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear") + # compute weighted loss + loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") + + def masked_loss(real, pred): + unmasked_loss = loss_fct(real, pred) + mask = tf.cast(real != self.config.semantic_loss_ignore_index, dtype=unmasked_loss.dtype) + masked_loss = unmasked_loss * mask + # Reduction strategy in the similar spirit with + # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210 + reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask) + return tf.reshape(reduced_masked_loss, (1,)) + + return masked_loss(labels, upsampled_logits) + + @unpack_inputs + @add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSemanticSegmenterOutputWithNoAttention, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[tuple, TFSemanticSegmenterOutputWithNoAttention]: + r""" + labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). 
+ + Returns: + + Examples: + + ```python + >>> from transformers import MobileViTFeatureExtractor, TFMobileViTForSemanticSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/deeplabv3-mobilevit-small") + >>> model = TFMobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small") + + >>> inputs = feature_extractor(images=image, return_tensors="tf") + + >>> outputs = model(**inputs) + + >>> # logits are of shape (batch_size, num_labels, height, width) + >>> logits = outputs.logits + ```""" + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mobilevit( + pixel_values, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + training=training, + ) + + encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + + logits = self.segmentation_head(encoder_hidden_states, training=training) + + loss = None + if labels is not None: + if not self.config.num_labels > 1: + raise ValueError("The number of labels should be greater than one") + else: + loss = self.hf_compute_loss(logits=logits, labels=labels) + + # make logits of shape (batch_size, num_labels, height, width) to + # keep them consistent across APIs + logits = tf.transpose(logits, perm=[0, 3, 1, 2]) + + if not return_dict: + if output_hidden_states: + output = (logits,) + outputs[1:] + else: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSemanticSegmenterOutputWithNoAttention( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + ) + + def serving_output( + self, output: TFSemanticSegmenterOutputWithNoAttention + ) -> TFSemanticSegmenterOutputWithNoAttention: + return TFSemanticSegmenterOutputWithNoAttention(logits=output.logits, hidden_states=output.hidden_states) diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index e09d26aec5c96b..bc3eb64ca46dab 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -1524,6 +1524,9 @@ def __init__(self, *args, **kwargs): TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None +TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + class TFMobileBertForMaskedLM(metaclass=DummyObject): _backends = ["tf"] @@ -1594,6 +1597,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +class TFMobileViTForImageClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFMobileViTForSemanticSegmentation(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFMobileViTModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFMobileViTPreTrainedModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + TF_MPNET_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git 
a/tests/models/mobilevit/test_modeling_tf_mobilevit.py b/tests/models/mobilevit/test_modeling_tf_mobilevit.py new file mode 100644 index 00000000000000..d46ee895ed71f5 --- /dev/null +++ b/tests/models/mobilevit/test_modeling_tf_mobilevit.py @@ -0,0 +1,420 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the TensorFlow MobileViT model. """ + + +import inspect +import unittest + +from transformers import MobileViTConfig +from transformers.file_utils import is_tf_available, is_vision_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor + + +if is_tf_available(): + import numpy as np + import tensorflow as tf + + from transformers import TFMobileViTForImageClassification, TFMobileViTForSemanticSegmentation, TFMobileViTModel + from transformers.models.mobilevit.modeling_tf_mobilevit import TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import MobileViTFeatureExtractor + + +class TFMobileViTConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, "hidden_sizes")) + self.parent.assertTrue(hasattr(config, "neck_hidden_sizes")) + self.parent.assertTrue(hasattr(config, "num_attention_heads")) + + +class TFMobileViTModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + patch_size=2, + num_channels=3, + last_hidden_size=640, + num_attention_heads=4, + hidden_act="silu", + conv_kernel_size=3, + output_stride=32, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + classifier_dropout_prob=0.1, + initializer_range=0.02, + is_training=True, + use_labels=True, + num_labels=10, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.last_hidden_size = last_hidden_size + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.conv_kernel_size = conv_kernel_size + self.output_stride = output_stride + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.classifier_dropout_prob = classifier_dropout_prob + self.use_labels = use_labels + self.is_training = is_training + self.num_labels = num_labels + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + pixel_labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + 
+ config = self.get_config() + + return config, pixel_values, labels, pixel_labels + + def get_config(self): + return MobileViTConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + num_attention_heads=self.num_attention_heads, + hidden_act=self.hidden_act, + conv_kernel_size=self.conv_kernel_size, + output_stride=self.output_stride, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + classifier_dropout_prob=self.classifier_dropout_prob, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels, pixel_labels): + model = TFMobileViTModel(config=config) + result = model(pixel_values, training=False) + expected_height = expected_width = self.image_size // self.output_stride + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.last_hidden_size, expected_height, expected_width) + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels): + config.num_labels = self.num_labels + model = TFMobileViTForImageClassification(config) + result = model(pixel_values, labels=labels, training=False) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels, pixel_labels): + config.num_labels = self.num_labels + model = TFMobileViTForSemanticSegmentation(config) + expected_height = expected_width = self.image_size // self.output_stride + + result = model(pixel_values, training=False) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, expected_height, expected_width) + ) + + result = model(pixel_values, labels=pixel_labels, training=False) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, expected_height, expected_width) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels, pixel_labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class MobileViTModelTest(TFModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as MobileViT does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = ( + (TFMobileViTModel, TFMobileViTForImageClassification, TFMobileViTForSemanticSegmentation) + if is_tf_available() + else () + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + test_onnx = False + + def setUp(self): + self.model_tester = TFMobileViTModelTester(self) + self.config_tester = TFMobileViTConfigTester(self, config_class=MobileViTConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="MobileViT does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="MobileViT does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="MobileViT does not output attentions") + def test_attention_outputs(self): + pass + + @unittest.skip("Test was written for TF 1.x and isn't really relevant here") + def test_compile_tf_model(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_stages = 5 + self.assertEqual(len(hidden_states), expected_num_stages) + + # MobileViT's feature maps are of shape (batch_size, num_channels, height, width) + # with the width and height being successively divided by 2. 
+            divisor = 2
+            for i in range(len(hidden_states)):
+                self.assertListEqual(
+                    list(hidden_states[i].shape[-2:]),
+                    [self.model_tester.image_size // divisor, self.model_tester.image_size // divisor],
+                )
+                divisor *= 2
+
+            self.assertEqual(self.model_tester.output_stride, divisor // 2)
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+            # check that output_hidden_states also works using config
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+    def test_for_image_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
+
+    def test_for_semantic_segmentation(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs)
+
+    @unittest.skipIf(
+        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
+        reason="TF does not support backprop for grouped convolutions on CPU.",
+    )
+    def test_dataset_conversion(self):
+        super().test_dataset_conversion()
+
+    def check_keras_fit_results(self, val_loss1, val_loss2, atol=2e-1, rtol=2e-1):
+        self.assertTrue(np.allclose(val_loss1, val_loss2, atol=atol, rtol=rtol))
+
+    @unittest.skipIf(
+        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
+        reason="TF does not support backprop for grouped convolutions on CPU.",
+    )
+    def test_keras_fit(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            # `TFMobileViTModel` cannot be trained with the default `fit()` method, so it is skipped here.
+            if model_class.__name__ != "TFMobileViTModel":
+                model = model_class(config)
+                if getattr(model, "hf_compute_loss", None):
+                    super().test_keras_fit()
+
+    # The default test_loss_computation() uses -100 as a proxy `ignore_index`
+    # to test masked losses. Overriding it here to avoid -100, since semantic segmentation
+    # models use `semantic_loss_ignore_index` from the config instead.
+    def test_loss_computation(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            # set an ignore index to correctly test the masked loss used in
+            # `TFMobileViTForSemanticSegmentation`.
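+            # (`semantic_loss_ignore_index` defaults to 255 in `MobileViTConfig`.)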
+            if model_class.__name__ != "TFMobileViTForSemanticSegmentation":
+                config.semantic_loss_ignore_index = 5
+
+            model = model_class(config)
+            if getattr(model, "hf_compute_loss", None):
+                # The number of elements in the loss should be the same as the number of elements in the label
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                added_label = prepared_for_class[
+                    sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0]
+                ]
+                expected_loss_size = added_label.shape.as_list()[:1]
+
+                # Test that the model correctly computes the loss with kwargs
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                possible_input_names = {"input_ids", "pixel_values", "input_features"}
+                input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
+                model_input = prepared_for_class.pop(input_name)
+
+                loss = model(model_input, **prepared_for_class)[0]
+                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
+
+                # Test that the model correctly computes the loss when we mask some positions
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                possible_input_names = {"input_ids", "pixel_values", "input_features"}
+                input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
+                model_input = prepared_for_class.pop(input_name)
+                if "labels" in prepared_for_class:
+                    labels = prepared_for_class["labels"].numpy()
+                    if len(labels.shape) > 1 and labels.shape[1] != 1:
+                        # no -100 masking here; the segmentation loss already ignores positions
+                        # equal to `config.semantic_loss_ignore_index`
+                        prepared_for_class["labels"] = tf.convert_to_tensor(labels)
+                        loss = model(model_input, **prepared_for_class)[0]
+                        self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
+                        self.assertTrue(not np.any(np.isnan(loss.numpy())))
+
+                # Test that the model correctly computes the loss with a dict
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                loss = model(prepared_for_class)[0]
+                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
+
+                # Test that the model correctly computes the loss with a tuple
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+
+                # Get keys that were added with the _prepare_for_class function
+                label_keys = prepared_for_class.keys() - inputs_dict.keys()
+                signature = inspect.signature(model.call).parameters
+                signature_names = list(signature.keys())
+
+                # Create a dictionary holding the location of the tensors in the tuple
+                tuple_index_mapping = {0: input_name}
+                for label_key in label_keys:
+                    label_key_index = signature_names.index(label_key)
+                    tuple_index_mapping[label_key_index] = label_key
+                sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
+                # Initialize a list with their default values, update the values and convert to a tuple
+                list_input = []
+
+                for name in signature_names:
+                    if name != "kwargs":
+                        list_input.append(signature[name].default)
+
+                for index, value in sorted_tuple_index_mapping:
+                    list_input[index] = prepared_for_class[value]
+
+                tuple_input = tuple(list_input)
+
+                # Send to model
+                loss = model(tuple_input[:-1])[0]
+
+                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = TFMobileViTModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    return image
+
+
+@require_tf
+class TFMobileViTModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_inference_image_classification_head(self):
+        model = TFMobileViTForImageClassification.from_pretrained("apple/mobilevit-xx-small")
+
+        feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-xx-small")
+        image = prepare_img()
+        inputs = feature_extractor(images=image, return_tensors="tf")
+
+        # forward pass
+        outputs = model(**inputs, training=False)
+
+        # verify the logits
+        expected_shape = tf.TensorShape((1, 1000))
+        self.assertEqual(outputs.logits.shape, expected_shape)
+
+        expected_slice = tf.constant([-1.9364, -1.2327, -0.4653])
+
+        tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4, rtol=1e-4)
+
+    @slow
+    def test_inference_semantic_segmentation(self):
+        model = TFMobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
+
+        feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
+
+        image = prepare_img()
+        inputs = feature_extractor(images=image, return_tensors="tf")
+
+        # forward pass
+        outputs = model(inputs.pixel_values, training=False)
+        logits = outputs.logits
+
+        # verify the logits
+        expected_shape = tf.TensorShape((1, 21, 32, 32))
+        self.assertEqual(logits.shape, expected_shape)
+
+        expected_slice = tf.constant(
+            [
+                [[6.9713, 6.9786, 7.2422], [7.2893, 7.2825, 7.4446], [7.6580, 7.8797, 7.9420]],
+                [[-10.6869, -10.3250, -10.3471], [-10.4228, -9.9868, -9.7132], [-11.0405, -11.0221, -10.7318]],
+                [[-3.3089, -2.8539, -2.6740], [-3.2706, -2.5621, -2.5108], [-3.2534, -2.6615, -2.6651]],
+            ]
+        )
+
+        tf.debugging.assert_near(logits[0, :3, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 74110e5ac2ebad..db1b39d120b172 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -52,6 +52,7 @@ src/transformers/models/mbart/modeling_mbart.py
src/transformers/models/mobilebert/modeling_mobilebert.py
src/transformers/models/mobilebert/modeling_tf_mobilebert.py
src/transformers/models/mobilevit/modeling_mobilevit.py
+src/transformers/models/mobilevit/modeling_tf_mobilevit.py
src/transformers/models/opt/modeling_opt.py
src/transformers/models/opt/modeling_tf_opt.py
src/transformers/models/owlvit/modeling_owlvit.py