From d2a084835fcf46306e768827c5811614800345a4 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 8 Feb 2022 21:33:40 +0530 Subject: [PATCH 01/65] feat: initial implementation of convnext in tensorflow. --- .../models/convnext/modeling_tf_convnext.py | 504 ++++++++++++++++++ 1 file changed, 504 insertions(+) create mode 100644 src/transformers/models/convnext/modeling_tf_convnext.py diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py new file mode 100644 index 0000000000000..e67088ba6d7ea --- /dev/null +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -0,0 +1,504 @@ +# coding=utf-8 +# Copyright 2022 Meta Platforms Inc., Sayak Paul, and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" TF 2.0 ConvNext model.""" + +import collections.abc +import math +from typing import Dict, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + input_processing, + keras_serializable, + shape_list, +) +from ...utils import logging +from .configuration_convnext import ConvNextConfig + + +logger = logging.get_logger(__name__) + + +_CONFIG_FOR_DOC = "ConvNextConfig" +_CHECKPOINT_FOR_DOC = "facebook/ConvNext-tiny-224" + + +class TFConvNextDropPath(tf.keras.layers.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + References: + (1) github.com:rwightman/pytorch-image-models + """ + + def __init__(self, drop_path, **kwargs): + super().__init__(**kwargs) + self.drop_path = drop_path + + def call(self, x, training=None): + if training: + keep_prob = 1 - self.drop_path + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +class TFConvNextEmbeddings(tf.keras.layers.Layer): + """This class is comparable to (and inspired by) the SwinEmbeddings class + found in src/transformers/models/swin/modeling_swin.py. 
+ """ + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.patch_embeddings = tf.keras.layers.Conv2D( + filters=config.hidden_sizes[0], + kernel_size=config.patch_size, + strides=config.patch_size, + name="patch_embeddings", + kernel_initializer=get_initializer(self.config.initializer_range), + bias_initializer="zeros", + ) + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") + + def call(self, pixel_values): + # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # So change the input format from `NCHW` to `NHWC`. + # shape = (batch_size, in_height, in_width, in_channels=num_channels) + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + + embeddings = self.patch_embeddings(pixel_values) + embeddings = self.layernorm(embeddings) + return embeddings + + +class TFConvNextLayer(tf.keras.layers.Layer): + """This corresponds to the `Block` class in the original implementation. + + There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C, + H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back + + The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to + follow NHWC ordering, we can just apply the operations straight-away without the permutation. + + Args: + config ([`ConvNextConfig`]): Model configuration class. + dim (`int`): Number of input channels. + drop_path (`float`): Stochastic depth rate. Default: 0.0. + """ + + def __init__(self, config, dim, drop_path=0, **kwargs): + # (sayakpaul): need to figure out the layer names. 
+ super().__init__(**kwargs) + self.dwconv = tf.keras.layers.Conv2D( + filters=dim, + kernel_size=7, + padding="same", + groups=dim, + kernel_initializer=get_initializer(self.config.initializer_range), + bias_initializer="zeros", + ) # depthwise conv + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6) + self.pwconv1 = tf.keras.layers.Dense( + units=4 * dim, + kernel_initializer=get_initializer(self.config.initializer_range), + bias_initializer="zeros", + ) # pointwise/1x1 convs, implemented with linear layers + self.act = get_tf_activation[config.hidden_act] + self.pwconv2 = tf.keras.layers.Dense( + units=dim, + kernel_initializer=get_initializer(self.config.initializer_range), + bias_initializer="zeros", + ) + self.layer_scale_parameter = ( + tf.Variable(config.layer_scale_init_value * tf.ones((dim,)), trainable=True, name="layer_scale_parameter") + if config.layer_scale_init_value > 0 + else None + ) + self.drop_path = TFConvNextDropPath(drop_path) if drop_path > 0.0 else tf.identity + + def call(self, hidden_states): + input = hidden_states + x = self.dwconv(hidden_states) + x = self.layernorm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + + if self.layer_scale_parameter is not None: + x = self.layer_scale_parameter * x + + x = input + self.drop_path(x) + return x + + +class TFConvNextStage(tf.keras.layers.Layer): + """ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks. + + Args: + config ([`ConvNextConfig`]): Model configuration class. + in_channels (`int`): Number of input channels. + out_channels (`int`): Number of output channels. + depth (`int`): Number of residual blocks. + drop_path_rates(`List[float]`): Stochastic depth rates for each layer. + """ + + def __init__( + self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs + ): + # (sayakpaul): need to figure out the names. 
+ super().__init__(**kwargs) + + if in_channels != out_channels or stride > 1: + self.downsampling_layer = tf.keras.Sequential( + [ + tf.keras.layers.LayerNormalization(epsilon=1e-6), + tf.keras.layers.Conv2D( + filters=out_channels, + kernel_size=kernel_size, + strides=stride, + kernel_initializer=get_initializer(self.config.initializer_range), + bias_initializer="zeros", + ), + ] + ) + else: + self.downsampling_layer = tf.identity + drop_path_rates = drop_path_rates or [0.0] * depth + self.layers = tf.keras.Sequential( + [*[TFConvNextLayer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]] + ) + + def forward(self, hidden_states): + hidden_states = self.downsampling_layer(hidden_states) + hidden_states = self.layers(hidden_states) + return hidden_states + + +class TFConvNextEncoder(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + # (sayakpaul): need to figure out the naming convention for `dwconv`, + # `pwconv1`, and `pwconv2`. + super().__init__(**kwargs) + self.stages = [] + drop_path_rates = [x.item() for x in tf.linspace(0, config.drop_path_rate, sum(config.depths))] + cur = 0 + prev_chs = config.hidden_sizes[0] + for i in range(config.num_stages): + out_chs = config.hidden_sizes[i] + stage = TFConvNextStage( + config, + in_channels=prev_chs, + out_channels=out_chs, + stride=2 if i > 0 else 1, + depth=config.depths[i], + drop_path_rates=drop_path_rates[cur], + ) + self.stages.append(stage) + cur += config.depths[i] + prev_chs = out_chs + + def call(self, hidden_states, output_hidden_states=False, return_dict=True): + all_hidden_states = () if output_hidden_states else None + + for i, layer_module in enumerate(self.stages): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + hidden_states = layer_module(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, 
all_hidden_states] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + ) + + +class TFConvNextPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ConvNextConfig + base_model_prefix = "convnext" + main_input_name = "pixel_values" + + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. + + Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. + """ + VISION_DUMMY_INPUTS = tf.random.uniform( + shape=(3, self.config.num_channels, self.config.image_size, self.config.image_size), dtype=tf.float32 + ) + return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} + + @tf.function( + input_signature=[ + { + "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), + } + ] + ) + def serving(self, inputs): + """ + Method used for serving the model. + + Args: + inputs (`Dict[str, tf.Tensor]`): + The input of the saved model as a dictionary of tensors. + """ + return self.call(inputs) + + +CONVNEXT_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. 
+ + This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the + tensors in the first argument of the model call function: `model(inputs)`. + + + + Parameters: + config ([`ConvNextConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CONVNEXT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`ConvNextFeatureExtractor`]. See + [`ConvNextFeatureExtractor.__call__`] for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used + in eager mode, in graph mode the value will always be set to True. 
+""" + + +@add_start_docstrings( + "The bare ConvNext model outputting raw features without any specific head on top.", + CONVNEXT_START_DOCSTRING, +) +class TFConvNextModel(TFConvNextPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.config = config + + self.embeddings = TFConvNextEmbeddings(config) + self.encoder = TFConvNextEncoder(config) + + # final layernorm layer + self.layernorm = tf.keras.layers.Layer(epsilon=config.layer_norm_eps) + + # global average pooling + self.pooler = tf.keras.layers.GlobalAvgPool2D() + + @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: Optional[TFModelInputType] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import ConvNextFeatureExtractor, TFConvNextModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") + >>> model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224") + + >>> inputs = feature_extractor(images=image, return_tensors="tf") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if "input_ids" in inputs: + inputs["pixel_values"] = inputs.pop("input_ids") + + if inputs["pixel_values"] is None: + 
raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + + pooled_output = self.layernorm(self.pooler(last_hidden_state)) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + CONVNEXT_START_DOCSTRING, +) +class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: ConvNextConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.convnext = TFConvNextModel(config) + + # Classifier head + self.classifier = tf.keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", + ) + + @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: Optional[TFModelInputType] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> from transformers import ConvNextFeatureExtractor, TFConvNextForImageClassification + >>> import tensorflow as tf + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ViTFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") + >>> model = TFViTForImageClassification.from_pretrained("facebook/convnext-tiny-224") + + >>> inputs = feature_extractor(images=image, return_tensors="tf") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0] + >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)]) + ```""" + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + + if "input_ids" in inputs: + inputs["pixel_values"] = inputs.pop("input_ids") + + outputs = self.convnext( + inputs["pixel_values"], output_hidden_states=output_hidden_states, return_dict=return_dict + ) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits) + + if not inputs["return_dict"]: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + ) From 583769c722162fecf30f7c6ad05450996cf38f3c Mon Sep 17 00:00:00 2001 
From: Sayak Paul Date: Wed, 9 Feb 2022 07:23:05 +0530 Subject: [PATCH 02/65] fix: sample code for the classification model. --- src/transformers/models/convnext/modeling_tf_convnext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index e67088ba6d7ea..52e10b4464202 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -460,7 +460,7 @@ def call( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ViTFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") + >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") >>> model = TFViTForImageClassification.from_pretrained("facebook/convnext-tiny-224") >>> inputs = feature_extractor(images=image, return_tensors="tf") From c667d93e6a49bd3eb327d1572178792842e3cd27 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 9 Feb 2022 07:30:52 +0530 Subject: [PATCH 03/65] chore: added checked for from the classification model. 
--- src/transformers/models/convnext/modeling_tf_convnext.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 52e10b4464202..39600954c8df6 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -484,6 +484,9 @@ def call( if "input_ids" in inputs: inputs["pixel_values"] = inputs.pop("input_ids") + if inputs["pixel_values"] is None: + raise ValueError("You have to specify pixel_values") + outputs = self.convnext( inputs["pixel_values"], output_hidden_states=output_hidden_states, return_dict=return_dict ) From 7aecfa9a1f04f9b75eaa5177d5866079702b4d44 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 9 Feb 2022 07:32:25 +0530 Subject: [PATCH 04/65] chore: set bias initializer in the classification head. --- src/transformers/models/convnext/modeling_tf_convnext.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 39600954c8df6..b265de9e406af 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -427,6 +427,7 @@ def __init__(self, config: ConvNextConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), + bias_initializer="zeros", name="classifier", ) From 222c46546568d1c6de6ce4f0aa137d51b32e9527 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 9 Feb 2022 08:02:13 +0530 Subject: [PATCH 05/65] chore: updated license terms. 
--- src/transformers/models/convnext/modeling_tf_convnext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index b265de9e406af..8defe1e063d79 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Meta Platforms Inc., Sayak Paul, and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 Meta Platforms Inc. and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 835dbdb99041395b368f468a0f0fdf5350c07e46 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 9 Feb 2022 08:45:25 +0530 Subject: [PATCH 06/65] chore: removed ununsed imports --- src/transformers/models/convnext/modeling_tf_convnext.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 8defe1e063d79..4bf7d42f3deb9 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -14,8 +14,7 @@ # limitations under the License. """ TF 2.0 ConvNext model.""" -import collections.abc -import math + from typing import Dict, Optional, Tuple, Union import numpy as np @@ -30,8 +29,6 @@ TFSequenceClassificationLoss, get_initializer, input_processing, - keras_serializable, - shape_list, ) from ...utils import logging from .configuration_convnext import ConvNextConfig From d6f91b64d125f6aba976848546fd55e6ed2fe7cb Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 9 Feb 2022 09:24:18 +0530 Subject: [PATCH 07/65] feat: enabled argument during using drop_path. 
--- src/transformers/models/convnext/modeling_tf_convnext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 4bf7d42f3deb9..3fccfccc4720f 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -134,7 +134,7 @@ def __init__(self, config, dim, drop_path=0, **kwargs): ) self.drop_path = TFConvNextDropPath(drop_path) if drop_path > 0.0 else tf.identity - def call(self, hidden_states): + def call(self, hidden_states, training=False): input = hidden_states x = self.dwconv(hidden_states) x = self.layernorm(x) @@ -145,7 +145,7 @@ def call(self, hidden_states): if self.layer_scale_parameter is not None: x = self.layer_scale_parameter * x - x = input + self.drop_path(x) + x = input + self.drop_path(x, training=training) return x From e1fec885706629faf6c520c1d34a6532d551ba9a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 10 Feb 2022 12:12:24 +0530 Subject: [PATCH 08/65] chore: replaced tf.identity with layers.Activation(linear). --- src/transformers/models/convnext/modeling_tf_convnext.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 3fccfccc4720f..ebc2a81f3d3a6 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -132,7 +132,9 @@ def __init__(self, config, dim, drop_path=0, **kwargs): if config.layer_scale_init_value > 0 else None ) - self.drop_path = TFConvNextDropPath(drop_path) if drop_path > 0.0 else tf.identity + # Using `layers.Activation` instead of `tf.identity` to better control `training` + # behaviour. 
+ self.drop_path = TFConvNextDropPath(drop_path) if drop_path > 0.0 else tf.keras.layers.Activation("linear") def call(self, hidden_states, training=False): input = hidden_states From 30e4bcb6f4f288b1004557ad93b56f43b02f0384 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 11 Feb 2022 11:09:00 +0530 Subject: [PATCH 09/65] chore: edited default checkpoint. --- src/transformers/models/convnext/modeling_tf_convnext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index ebc2a81f3d3a6..52817a509cb67 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -38,7 +38,7 @@ _CONFIG_FOR_DOC = "ConvNextConfig" -_CHECKPOINT_FOR_DOC = "facebook/ConvNext-tiny-224" +_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224" class TFConvNextDropPath(tf.keras.layers.Layer): From b0051acee0066ef7c7b9595224e0c78506b11b78 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 11 Feb 2022 22:29:11 +0530 Subject: [PATCH 10/65] fix: minor bugs in the initializations. 
--- .../models/convnext/modeling_tf_convnext.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 52817a509cb67..bb99f472bc1dd 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -73,7 +73,7 @@ def __init__(self, config, **kwargs): kernel_size=config.patch_size, strides=config.patch_size, name="patch_embeddings", - kernel_initializer=get_initializer(self.config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", ) self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") @@ -104,7 +104,7 @@ class TFConvNextLayer(tf.keras.layers.Layer): drop_path (`float`): Stochastic depth rate. Default: 0.0. """ - def __init__(self, config, dim, drop_path=0, **kwargs): + def __init__(self, config, dim, drop_path=0.0, **kwargs): # (sayakpaul): need to figure out the layer names. 
super().__init__(**kwargs) self.dwconv = tf.keras.layers.Conv2D( @@ -112,19 +112,19 @@ def __init__(self, config, dim, drop_path=0, **kwargs): kernel_size=7, padding="same", groups=dim, - kernel_initializer=get_initializer(self.config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", ) # depthwise conv self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6) self.pwconv1 = tf.keras.layers.Dense( units=4 * dim, - kernel_initializer=get_initializer(self.config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", ) # pointwise/1x1 convs, implemented with linear layers - self.act = get_tf_activation[config.hidden_act] + self.act = get_tf_activation(config.hidden_act) self.pwconv2 = tf.keras.layers.Dense( units=dim, - kernel_initializer=get_initializer(self.config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", ) self.layer_scale_parameter = ( @@ -176,7 +176,7 @@ def __init__( filters=out_channels, kernel_size=kernel_size, strides=stride, - kernel_initializer=get_initializer(self.config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", ), ] @@ -200,7 +200,7 @@ def __init__(self, config, **kwargs): # `pwconv1`, and `pwconv2`. 
super().__init__(**kwargs) self.stages = [] - drop_path_rates = [x.item() for x in tf.linspace(0, config.drop_path_rate, sum(config.depths))] + drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))] cur = 0 prev_chs = config.hidden_sizes[0] for i in range(config.num_stages): @@ -335,7 +335,7 @@ def __init__(self, config, *inputs, **kwargs): self.encoder = TFConvNextEncoder(config) # final layernorm layer - self.layernorm = tf.keras.layers.Layer(epsilon=config.layer_norm_eps) + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps) # global average pooling self.pooler = tf.keras.layers.GlobalAvgPool2D() From aeb14f7329d9543c3b661853b8b6c8f5427fca37 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sat, 12 Feb 2022 01:10:18 +0530 Subject: [PATCH 11/65] partial-fix: tf model errors for loading pretrained pt weights. --- src/transformers/modeling_tf_utils.py | 12 ++++++++---- .../models/convnext/configuration_convnext.py | 3 +++ .../models/convnext/modeling_tf_convnext.py | 2 ++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 54f465215fb2c..be54906fbda6b 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -309,9 +309,13 @@ def booleans_processing(config, **kwargs): final_booleans = {} if tf.executing_eagerly(): - final_booleans["output_attentions"] = ( - kwargs["output_attentions"] if kwargs["output_attentions"] is not None else config.output_attentions - ) + # final_booleans["output_attentions"] = ( + # kwargs["output_attentions"] if kwargs["output_attentions"] else config.output_attentions + # ) + final_booleans["output_attentions"] = kwargs.get("output_attentions", None) + if not final_booleans["output_attentions"]: + final_booleans["output_attentions"] = config.output_attentions + final_booleans["output_hidden_states"] = ( kwargs["output_hidden_states"] if 
kwargs["output_hidden_states"] is not None @@ -1827,7 +1831,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range + self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range def build(self, input_shape): """ diff --git a/src/transformers/models/convnext/configuration_convnext.py b/src/transformers/models/convnext/configuration_convnext.py index 8d99c657cc639..c09a54e86a7e2 100644 --- a/src/transformers/models/convnext/configuration_convnext.py +++ b/src/transformers/models/convnext/configuration_convnext.py @@ -85,6 +85,7 @@ def __init__( is_encoder_decoder=False, layer_scale_init_value=1e-6, drop_path_rate=0.0, + image_size=224, **kwargs ): super().__init__(**kwargs) @@ -99,3 +100,5 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.layer_scale_init_value = layer_scale_init_value self.drop_path_rate = drop_path_rate + self.image_size = image_size + self.output_attentions = None diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index bb99f472bc1dd..51f27a8d88a70 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -370,6 +370,8 @@ def call( >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + inputs = input_processing( func=self.call, config=self.config, From aec69dcf1e052aa8e84c731b8dab99c1afaf1229 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Sat, 12 Feb 2022 02:40:55 +0530 Subject: [PATCH 12/65] partial-fix: call method updated --- src/transformers/models/convnext/modeling_tf_convnext.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 51f27a8d88a70..db0d8424576ff 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -188,7 +188,7 @@ def __init__( [*[TFConvNextLayer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]] ) - def forward(self, hidden_states): + def call(self, hidden_states): hidden_states = self.downsampling_layer(hidden_states) hidden_states = self.layers(hidden_states) return hidden_states From 6c0fae263457b7e496ee331716a0079023fb07af Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sat, 12 Feb 2022 12:03:14 +0530 Subject: [PATCH 13/65] partial-fix: cross loading of weights (4x3 variables to be matched) --- .../models/convnext/modeling_tf_convnext.py | 56 ++++++++++++++----- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index db0d8424576ff..8a8946575a26f 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -68,6 +68,8 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) + # note that we do not use the `base_name` here in `patch_embeddings` + # and `layernorm` self.patch_embeddings = tf.keras.layers.Conv2D( filters=config.hidden_sizes[0], kernel_size=config.patch_size, @@ -105,8 +107,8 @@ class TFConvNextLayer(tf.keras.layers.Layer): """ def __init__(self, config, dim, drop_path=0.0, **kwargs): - # (sayakpaul): need to figure out the layer names. 
super().__init__(**kwargs) + base_name = kwargs.get("name") self.dwconv = tf.keras.layers.Conv2D( filters=dim, kernel_size=7, @@ -114,27 +116,38 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs): groups=dim, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", + name=f"{base_name}.dwconv", ) # depthwise conv - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6) + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name=f"{base_name}.layernorm") self.pwconv1 = tf.keras.layers.Dense( units=4 * dim, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", + name=f"{base_name}.pwconv1", ) # pointwise/1x1 convs, implemented with linear layers self.act = get_tf_activation(config.hidden_act) self.pwconv2 = tf.keras.layers.Dense( units=dim, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", + name=f"{base_name}.pwconv2", ) self.layer_scale_parameter = ( - tf.Variable(config.layer_scale_init_value * tf.ones((dim,)), trainable=True, name="layer_scale_parameter") + tf.Variable( + config.layer_scale_init_value * tf.ones((dim,)), + trainable=True, + name=f"{base_name}.layer_scale_parameter", + ) if config.layer_scale_init_value > 0 else None ) # Using `layers.Activation` instead of `tf.identity` to better control `training` # behaviour. - self.drop_path = TFConvNextDropPath(drop_path) if drop_path > 0.0 else tf.keras.layers.Activation("linear") + self.drop_path = ( + TFConvNextDropPath(drop_path, name=f"{base_name}.drop_path") + if drop_path > 0.0 + else tf.keras.layers.Activation("linear", name=f"{base_name}.drop_path") + ) def call(self, hidden_states, training=False): input = hidden_states @@ -167,25 +180,37 @@ def __init__( ): # (sayakpaul): need to figure out the names. 
super().__init__(**kwargs) - + base_name = kwargs.get("name") if in_channels != out_channels or stride > 1: self.downsampling_layer = tf.keras.Sequential( [ - tf.keras.layers.LayerNormalization(epsilon=1e-6), + tf.keras.layers.LayerNormalization( + epsilon=1e-6, + name=f"{base_name}.downsampling_layer.0", + ), tf.keras.layers.Conv2D( filters=out_channels, kernel_size=kernel_size, strides=stride, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", + name=f"{base_name}.downsampling_layer.1", ), - ] + ], ) else: - self.downsampling_layer = tf.identity + self.downsampling_layer = tf.keras.layers.Activation("linear") + drop_path_rates = drop_path_rates or [0.0] * depth self.layers = tf.keras.Sequential( - [*[TFConvNextLayer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]] + [ + *[ + TFConvNextLayer( + config, dim=out_channels, drop_path=drop_path_rates[j], name=f"{base_name}.layers.{j}" + ) + for j in range(depth) + ] + ], ) def call(self, hidden_states): @@ -199,6 +224,7 @@ def __init__(self, config, **kwargs): # (sayakpaul): need to figure out the naming convention for `dwconv`, # `pwconv1`, and `pwconv2`. 
super().__init__(**kwargs) + base_name = kwargs.get("name") self.stages = [] drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))] cur = 0 @@ -212,6 +238,7 @@ def __init__(self, config, **kwargs): stride=2 if i > 0 else 1, depth=config.depths[i], drop_path_rates=drop_path_rates[cur], + name=f"{base_name}.stages.{i}", ) self.stages.append(stage) cur += config.depths[i] @@ -329,13 +356,16 @@ def serving(self, inputs): class TFConvNextModel(TFConvNextPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) + base_name = kwargs.get("name") self.config = config - self.embeddings = TFConvNextEmbeddings(config) - self.encoder = TFConvNextEncoder(config) + # Observe the name parameter in `encoder`, `embeddings`, and `layernorm` + # Adding `base_name` to the embeddings and layernorm adds errors. + self.embeddings = TFConvNextEmbeddings(config, name="embeddings") + self.encoder = TFConvNextEncoder(config, name=f"{base_name}.encoder") # final layernorm layer - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps) + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") # global average pooling self.pooler = tf.keras.layers.GlobalAvgPool2D() @@ -422,7 +452,7 @@ def __init__(self, config: ConvNextConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.convnext = TFConvNextModel(config) + self.convnext = TFConvNextModel(config, name="convnext") # Classifier head self.classifier = tf.keras.layers.Dense( From ee62db49f859db2300c73472ef74db290f582a92 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:08:14 +0530 Subject: [PATCH 14/65] chore: removed unneeded comment. 
--- playground.py | 38 +++++++++++++++++++ .../models/convnext/modeling_tf_convnext.py | 2 - 2 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 playground.py diff --git a/playground.py b/playground.py new file mode 100644 index 0000000000000..40e83b0c9d107 --- /dev/null +++ b/playground.py @@ -0,0 +1,38 @@ +import tensorflow as tf +from transformers import AutoFeatureExtractor + +# import your TFConvNextForImageClassification class here, we will take care +# of adding the boilerplate to run `from transformers import +# TFConvNextForImageClassification` later +from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification +from transformers import ConvNextForImageClassification + +from PIL import Image + +# model = ConvNextForImageClassification.from_pretrained( +# "facebook/convnext-tiny-224", +# ) +# print(f"Model State Dict:\n") +# all_keys = list(model.state_dict().keys()) +# print([k for k in all_keys if "layer_scale" in k]) + +model = TFConvNextForImageClassification.from_pretrained( + "facebook/convnext-tiny-224", + from_pt=True, +) # notice the `from_pt` argument +print(model.summary(expand_nested=True)) + + +# feature_extractor = AutoFeatureExtractor.from_pretrained( +# "facebook/convnext-tiny-224" +# ) # don't know if this is supposed to work with TF as well, change this as needed + +# image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path +# inputs = feature_extractor(images=image, return_tensors="tf") + +# # forward pass +# outputs = model(**inputs) + +# # verify the logits +# assert outputs.logits.shape == [1, 1000] +# tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 8a8946575a26f..75ab13c9662e4 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ 
b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -221,8 +221,6 @@ def call(self, hidden_states): class TFConvNextEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - # (sayakpaul): need to figure out the naming convention for `dwconv`, - # `pwconv1`, and `pwconv2`. super().__init__(**kwargs) base_name = kwargs.get("name") self.stages = [] From 8c1d6a3e00fd608b0c1761c9fdd9077a0afb7b9a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:14:29 +0530 Subject: [PATCH 15/65] removed playground.py --- playground.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 40e83b0c9d107..0000000000000 --- a/playground.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow as tf -from transformers import AutoFeatureExtractor - -# import your TFConvNextForImageClassification class here, we will take care -# of adding the boilerplate to run `from transformers import -# TFConvNextForImageClassification` later -from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification -from transformers import ConvNextForImageClassification - -from PIL import Image - -# model = ConvNextForImageClassification.from_pretrained( -# "facebook/convnext-tiny-224", -# ) -# print(f"Model State Dict:\n") -# all_keys = list(model.state_dict().keys()) -# print([k for k in all_keys if "layer_scale" in k]) - -model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, -) # notice the `from_pt` argument -print(model.summary(expand_nested=True)) - - -# feature_extractor = AutoFeatureExtractor.from_pretrained( -# "facebook/convnext-tiny-224" -# ) # don't know if this is supposed to work with TF as well, change this as needed - -# image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path -# inputs = 
feature_extractor(images=image, return_tensors="tf") - -# # forward pass -# outputs = model(**inputs) - -# # verify the logits -# assert outputs.logits.shape == [1, 1000] -# tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From 490adf887b019ca5adbcc15d5b537898b19c54a7 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:18:41 +0530 Subject: [PATCH 16/65] rebasing --- playground.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 playground.py diff --git a/playground.py b/playground.py new file mode 100644 index 0000000000000..8a53d5babd2be --- /dev/null +++ b/playground.py @@ -0,0 +1,38 @@ +import tensorflow as tf +from transformers import AutoFeatureExtractor + +# import your TFConvNextForImageClassification class here, we will take care +# of adding the boilerplate to run `from transformers import +# TFConvNextForImageClassification` later +from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification +from transformers import ConvNextForImageClassification + +from PIL import Image + +# model = ConvNextForImageClassification.from_pretrained( +# "facebook/convnext-tiny-224", +# ) +# print(f"Model State Dict:\n") +# all_keys = list(model.state_dict().keys()) +# print([k for k in all_keys if "layer_scale" in k]) + +model = TFConvNextForImageClassification.from_pretrained( + "facebook/convnext-tiny-224", + from_pt=True, +) # notice the `from_pt` argument +print(model.summary(expand_nested=True)) + + +feature_extractor = AutoFeatureExtractor.from_pretrained( + "facebook/convnext-tiny-224" +) # don't know if this is supposed to work with TF as well, change this as needed + +image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path +inputs = feature_extractor(images=image, return_tensors="tf") + +# forward pass +outputs = model(**inputs) + +# verify the logits +assert 
outputs.logits.shape == [1, 1000] +tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From fa494693ca9e3dd8a693440c9c8c7c4ac411f686 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:19:23 +0530 Subject: [PATCH 17/65] rebasing and removing playground.py. --- playground.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 8a53d5babd2be..0000000000000 --- a/playground.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow as tf -from transformers import AutoFeatureExtractor - -# import your TFConvNextForImageClassification class here, we will take care -# of adding the boilerplate to run `from transformers import -# TFConvNextForImageClassification` later -from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification -from transformers import ConvNextForImageClassification - -from PIL import Image - -# model = ConvNextForImageClassification.from_pretrained( -# "facebook/convnext-tiny-224", -# ) -# print(f"Model State Dict:\n") -# all_keys = list(model.state_dict().keys()) -# print([k for k in all_keys if "layer_scale" in k]) - -model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, -) # notice the `from_pt` argument -print(model.summary(expand_nested=True)) - - -feature_extractor = AutoFeatureExtractor.from_pretrained( - "facebook/convnext-tiny-224" -) # don't know if this is supposed to work with TF as well, change this as needed - -image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path -inputs = feature_extractor(images=image, return_tensors="tf") - -# forward pass -outputs = model(**inputs) - -# verify the logits -assert outputs.logits.shape == [1, 1000] -tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], 
atol=1e-4) From acb6fa006a9596348f06da5247a101154c8bc3c5 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Mon, 14 Feb 2022 13:41:43 +0530 Subject: [PATCH 18/65] fix: renaming TFConvNextStage conv and layer norm layers --- src/transformers/models/convnext/modeling_tf_convnext.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 75ab13c9662e4..dd90d472633a1 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -186,7 +186,7 @@ def __init__( [ tf.keras.layers.LayerNormalization( epsilon=1e-6, - name=f"{base_name}.downsampling_layer.0", + name=f"{base_name}/{base_name}.downsampling_layer.0", ), tf.keras.layers.Conv2D( filters=out_channels, @@ -194,12 +194,12 @@ def __init__( strides=stride, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", - name=f"{base_name}.downsampling_layer.1", + name=f"{base_name}/{base_name}.downsampling_layer.1", ), ], ) else: - self.downsampling_layer = tf.keras.layers.Activation("linear") + self.downsampling_layer = tf.identity drop_path_rates = drop_path_rates or [0.0] * depth self.layers = tf.keras.Sequential( From 8d56711c4ad2787eb0ce6f9f5d151a6c25f18626 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 14 Feb 2022 15:20:41 +0530 Subject: [PATCH 19/65] chore: added initializers and other minor additions. 
--- docs/source/index.mdx | 2 +- docs/source/model_doc/convnext.mdx | 17 ++++++++- playground.py | 38 +++++++++++++++++++ src/transformers/__init__.py | 4 ++ .../models/auto/modeling_tf_auto.py | 2 + src/transformers/models/convnext/__init__.py | 8 +++- src/transformers/utils/dummy_tf_objects.py | 21 ++++++++++ tests/test_modeling_tf_common.py | 8 +++- 8 files changed, 95 insertions(+), 5 deletions(-) create mode 100644 playground.py diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 9ee4377110cd8..37f3efb7e2b85 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -177,7 +177,7 @@ Flax), PyTorch, and/or TensorFlow. | Canine | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ConvNext | ❌ | ❌ | ✅ | ❌ | ❌ | +| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | | DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/model_doc/convnext.mdx b/docs/source/model_doc/convnext.mdx index e3a04d371e64c..c2323402beabf 100644 --- a/docs/source/model_doc/convnext.mdx +++ b/docs/source/model_doc/convnext.mdx @@ -37,7 +37,8 @@ alt="drawing" width="600"/> ConvNeXT architecture. Taken from the original paper. -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt). +This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [ariG23498](https://github.com/ariG23498) +and [sayakpaul](https://github.com/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt). ## ConvNeXT specific outputs @@ -63,4 +64,16 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). 
The origi ## ConvNextForImageClassification [[autodoc]] ConvNextForImageClassification - - forward \ No newline at end of file + - forward + + +## TFConvNextModel + +[[autodoc]] TFConvNextModel + - call + + +## TFConvNextForImageClassification + +[[autodoc]] TFConvNextForImageClassification + - call \ No newline at end of file diff --git a/playground.py b/playground.py new file mode 100644 index 0000000000000..8a53d5babd2be --- /dev/null +++ b/playground.py @@ -0,0 +1,38 @@ +import tensorflow as tf +from transformers import AutoFeatureExtractor + +# import your TFConvNextForImageClassification class here, we will take care +# of adding the boilerplate to run `from transformers import +# TFConvNextForImageClassification` later +from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification +from transformers import ConvNextForImageClassification + +from PIL import Image + +# model = ConvNextForImageClassification.from_pretrained( +# "facebook/convnext-tiny-224", +# ) +# print(f"Model State Dict:\n") +# all_keys = list(model.state_dict().keys()) +# print([k for k in all_keys if "layer_scale" in k]) + +model = TFConvNextForImageClassification.from_pretrained( + "facebook/convnext-tiny-224", + from_pt=True, +) # notice the `from_pt` argument +print(model.summary(expand_nested=True)) + + +feature_extractor = AutoFeatureExtractor.from_pretrained( + "facebook/convnext-tiny-224" +) # don't know if this is supposed to work with TF as well, change this as needed + +image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path +inputs = feature_extractor(images=image, return_tensors="tf") + +# forward pass +outputs = model(**inputs) + +# verify the logits +assert outputs.logits.shape == [1, 1000] +tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 
f4b0e2908b61d..c9e6feec10fcb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -852,6 +852,9 @@ "ConvNextForImageClassification", "ConvNextModel", "ConvNextPreTrainedModel", + "TFConvNextForImageClassification", + "TFConvNextModel", + "TFConvNextPreTrainedModel", ] ) _import_structure["models.ctrl"].extend( @@ -3680,6 +3683,7 @@ TFConvBertModel, TFConvBertPreTrainedModel, ) + from .models.convnext import TFConvNextForImageClassification, TFConvNextModel, TFConvNextPreTrainedModel from .models.ctrl import ( TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, TFCTRLForSequenceClassification, diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index cd4158bc7dd46..1b95cfa01d545 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -36,6 +36,7 @@ ("rembert", "TFRemBertModel"), ("roformer", "TFRoFormerModel"), ("convbert", "TFConvBertModel"), + ("convnext", "TFConvNextModel"), ("led", "TFLEDModel"), ("lxmert", "TFLxmertModel"), ("mt5", "TFMT5Model"), @@ -155,6 +156,7 @@ [ # Model for Image-classsification ("vit", "TFViTForImageClassification"), + ("convnext", "TFConvNextForImageClassification"), ] ) diff --git a/src/transformers/models/convnext/__init__.py b/src/transformers/models/convnext/__init__.py index cdc064d3c994a..995d38f80998d 100644 --- a/src/transformers/models/convnext/__init__.py +++ b/src/transformers/models/convnext/__init__.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from ...file_utils import _LazyModule, is_torch_available, is_vision_available +from ...file_utils import _LazyModule, is_torch_available, is_tf_available, is_vision_available _import_structure = { @@ -36,6 +36,12 @@ "ConvNextPreTrainedModel", ] +if is_tf_available(): + _import_structure["modeling_tf_convnext"] = [ + "TFConvNextForImageClassification", + "TFConvNextModel", + 
"TFConvNextPreTrainedModel", + ] if TYPE_CHECKING: from .configuration_convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 02b401ef394ec..d70937fe19d0c 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -599,6 +599,27 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +class TFConvNextForImageClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFConvNextModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFConvNextPreTrainedModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 95c953a6e3aec..6b442c1ffc1d9 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -475,7 +475,13 @@ def test_compile_tf_model(self): "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"), } # TODO: A better way to handle vision models - elif model_class.__name__ in ["TFViTModel", "TFViTForImageClassification", "TFCLIPVisionModel"]: + elif model_class.__name__ in [ + "TFConvNextModel", + "TFConvNextForImageClassification", + "TFViTModel", + "TFViTForImageClassification", + "TFCLIPVisionModel", + ]: inputs = tf.keras.Input( batch_shape=( 3, From 11b0683dc5024ede739d1a7c57b1e39b1843976f Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 14 Feb 2022 15:21:06 +0530 Subject: [PATCH 20/65] chore: added initializers and other minor additions. 
--- playground.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 8a53d5babd2be..0000000000000 --- a/playground.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow as tf -from transformers import AutoFeatureExtractor - -# import your TFConvNextForImageClassification class here, we will take care -# of adding the boilerplate to run `from transformers import -# TFConvNextForImageClassification` later -from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification -from transformers import ConvNextForImageClassification - -from PIL import Image - -# model = ConvNextForImageClassification.from_pretrained( -# "facebook/convnext-tiny-224", -# ) -# print(f"Model State Dict:\n") -# all_keys = list(model.state_dict().keys()) -# print([k for k in all_keys if "layer_scale" in k]) - -model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, -) # notice the `from_pt` argument -print(model.summary(expand_nested=True)) - - -feature_extractor = AutoFeatureExtractor.from_pretrained( - "facebook/convnext-tiny-224" -) # don't know if this is supposed to work with TF as well, change this as needed - -image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path -inputs = feature_extractor(images=image, return_tensors="tf") - -# forward pass -outputs = model(**inputs) - -# verify the logits -assert outputs.logits.shape == [1, 1000] -tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From fd0ca7fa8e28f3477f206ce0b223ab2e9f00ae94 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 14 Feb 2022 18:08:04 +0530 Subject: [PATCH 21/65] add: tests for convnext. 
--- .../models/convnext/modeling_tf_convnext.py | 3 + tests/test_modeling_tf_convnext.py | 243 ++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 tests/test_modeling_tf_convnext.py diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index dd90d472633a1..2d7d24860f0fb 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -84,6 +84,9 @@ def call(self, pixel_values): # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. # shape = (batch_size, in_height, in_width, in_channels=num_channels) + if isinstance(pixel_values, dict): + pixel_values = pixel_values["pixel_values"] + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) embeddings = self.patch_embeddings(pixel_values) diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py new file mode 100644 index 0000000000000..b2bcc980ae946 --- /dev/null +++ b/tests/test_modeling_tf_convnext.py @@ -0,0 +1,243 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the TensorFlow ConvNext model. 
""" + +import unittest +import inspect + +from transformers import ConvNextConfig +from transformers.file_utils import cached_property, is_tf_available, is_vision_available +from transformers.testing_utils import require_tf, require_vision, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFConvNextForImageClassification, TFConvNextModel + + +if is_vision_available(): + from PIL import Image + + from transformers import ConvNextFeatureExtractor + + +class TFConvNextModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + num_channels=3, + num_stages=4, + hidden_sizes=[10, 20, 30, 40], + depths=[2, 2, 3, 2], + is_training=True, + use_labels=True, + intermediate_size=37, + hidden_act="gelu", + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.num_stages = num_stages + self.hidden_sizes = hidden_sizes + self.depths = depths + self.is_training = is_training + self.use_labels = use_labels + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return ConvNextConfig( + num_channels=self.num_channels, + hidden_sizes=self.hidden_sizes, + depths=self.depths, + num_stages=self.num_stages, + hidden_act=self.hidden_act, + 
is_decoder=False, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TFConvNextModel(config=config) + result = model(pixel_values, training=False) + # expected last hidden states: B, H // 32, W // 32, C + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.image_size // 32, self.image_size // 32, self.hidden_sizes[-1]), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = TFConvNextForImageClassification(config) + result = model(pixel_values, labels=labels, training=False) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFConvNextModelTest(TFModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = ( + ( + TFConvNextModel, + TFConvNextForImageClassification, + ) + if is_tf_available() + else () + ) + + test_pruning = False + test_onnx = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = TFConvNextModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConvNextConfig, has_text_modality=False, hidden_size=37) + + @unittest.skip(reason="ConvNext does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="ConvNext does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="Model doesn't have attention layers") + def test_attention_outputs(self): + pass + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_stages = self.model_tester.num_stages + self.assertEqual(len(hidden_states), expected_num_stages + 1) + + # ConvNext's feature maps are of shape (batch_size, height, width, num_channels) in TF + self.assertListEqual( + list(hidden_states[0].shape[1:-1]), + 
[self.model_tester.image_size // 4, self.model_tester.image_size // 4], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224") + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_tf +@require_vision +class TFViTModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ConvNextFeatureExtractor.from_pretrained("acebook/convnext-tiny-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = TFConvNextForImageClassification.from_pretrained( + "acebook/convnext-tiny-224", + from_pt=True, + ) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="tf") + + # forward pass + outputs = model(**inputs) + + # verify the logits + expected_shape = tf.TensorShape((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = tf.constant([-0.0260, -0.4739, 0.1911]) + + tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4) From 98911a249fa29a40f44d7cc25392e0f25c7a4c36 Mon Sep 17 00:00:00 2001 
From: Sayak Paul Date: Mon, 14 Feb 2022 20:42:05 +0530 Subject: [PATCH 22/65] fix: integration tester class. --- tests/test_modeling_tf_convnext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py index b2bcc980ae946..d70d9d35d8d57 100644 --- a/tests/test_modeling_tf_convnext.py +++ b/tests/test_modeling_tf_convnext.py @@ -215,7 +215,7 @@ def prepare_img(): @require_tf @require_vision -class TFViTModelIntegrationTest(unittest.TestCase): +class TFConvNextModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): return ConvNextFeatureExtractor.from_pretrained("acebook/convnext-tiny-224") if is_vision_available() else None From b30a8ccb4ccb75ed914c2af982d102a023bda4de Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 16 Feb 2022 07:09:00 +0530 Subject: [PATCH 23/65] fix: issues mentioned in pr feedback (round 1). --- src/transformers/modeling_tf_utils.py | 3 --- .../models/convnext/modeling_tf_convnext.py | 1 - tests/test_modeling_tf_common.py | 10 ++-------- tests/test_modeling_tf_convnext.py | 6 ++++-- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 1d62180f29f5e..f5249a8e76b07 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -309,9 +309,6 @@ def booleans_processing(config, **kwargs): final_booleans = {} if tf.executing_eagerly(): - # final_booleans["output_attentions"] = ( - # kwargs["output_attentions"] if kwargs["output_attentions"] else config.output_attentions - # ) final_booleans["output_attentions"] = kwargs.get("output_attentions", None) if not final_booleans["output_attentions"]: final_booleans["output_attentions"] = config.output_attentions diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 
2d7d24860f0fb..d355a5663709f 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -181,7 +181,6 @@ class TFConvNextStage(tf.keras.layers.Layer): def __init__( self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs ): - # (sayakpaul): need to figure out the names. super().__init__(**kwargs) base_name = kwargs.get("name") if in_channels != out_channels or stride > 1: diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 6b442c1ffc1d9..6fe6ea3b52e16 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -474,14 +474,8 @@ def test_compile_tf_model(self): ), "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"), } - # TODO: A better way to handle vision models - elif model_class.__name__ in [ - "TFConvNextModel", - "TFConvNextForImageClassification", - "TFViTModel", - "TFViTForImageClassification", - "TFCLIPVisionModel", - ]: + # `pixel_values` implies that the input is an image + elif model_class.main_input_name == "pixel_values": inputs = tf.keras.Input( batch_shape=( 3, diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py index d70d9d35d8d57..c47a895453b87 100644 --- a/tests/test_modeling_tf_convnext.py +++ b/tests/test_modeling_tf_convnext.py @@ -218,12 +218,14 @@ def prepare_img(): class TFConvNextModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): - return ConvNextFeatureExtractor.from_pretrained("acebook/convnext-tiny-224") if is_vision_available() else None + return ( + ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None + ) @slow def test_inference_image_classification_head(self): model = TFConvNextForImageClassification.from_pretrained( - "acebook/convnext-tiny-224", + "facebook/convnext-tiny-224", 
from_pt=True, ) From 2181d5b2fddd23ec214c6601a2beabbbcdd75dc8 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 16 Feb 2022 07:58:49 +0530 Subject: [PATCH 24/65] fix: how output_hidden_states arg is propagated inside the network. --- src/transformers/models/convnext/modeling_tf_convnext.py | 3 +++ tests/test_modeling_tf_convnext.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index d355a5663709f..7b899723c1589 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -400,6 +400,9 @@ def call( >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state ```""" + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict inputs = input_processing( diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py index c47a895453b87..2bb05b1aed953 100644 --- a/tests/test_modeling_tf_convnext.py +++ b/tests/test_modeling_tf_convnext.py @@ -173,7 +173,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states expected_num_stages = self.model_tester.num_stages From cc98979f3177394c3f0e4b6c41093fcbce07bec5 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 16 Feb 2022 08:31:18 +0530 Subject: [PATCH 25/65] feat: handling of arg for pure cnn models. 
--- tests/test_modeling_tf_common.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 6fe6ea3b52e16..822656a3afbff 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -475,7 +475,7 @@ def test_compile_tf_model(self): "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"), } # `pixel_values` implies that the input is an image - elif model_class.main_input_name == "pixel_values": + elif model_class.main_input_name == "pixel_values": inputs = tf.keras.Input( batch_shape=( 3, @@ -799,23 +799,27 @@ def recursive_check(tuple_object, dict_object): dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + # Pure conv models (such as ConvNeXt) don't have `output_attentions`. 
+ if config.output_attentions: + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + if config.output_attentions: + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} - ) + if config.output_attentions: + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) def test_inputs_embeds(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From 12e4505d5d4d259eba51fccc9bd0c5ad69b2755d Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 16 Feb 2022 08:34:05 +0530 Subject: [PATCH 26/65] chore: added a note on equal contribution in model docs. 
--- docs/source/model_doc/convnext.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/model_doc/convnext.mdx b/docs/source/model_doc/convnext.mdx index c2323402beabf..f2e789b36916a 100644 --- a/docs/source/model_doc/convnext.mdx +++ b/docs/source/model_doc/convnext.mdx @@ -38,7 +38,7 @@ alt="drawing" width="600"/> ConvNeXT architecture. Taken from the original paper. This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [ariG23498](https://github.com/ariG23498) -and [sayakpaul](https://github.com/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt). +and [sayakpaul](https://github.com/sayakpaul) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt). ## ConvNeXT specific outputs From eb493386372ce42939e71c29083574049e3aa7a9 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:18:41 +0530 Subject: [PATCH 27/65] rebasing --- playground.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 playground.py diff --git a/playground.py b/playground.py new file mode 100644 index 0000000000000..8a53d5babd2be --- /dev/null +++ b/playground.py @@ -0,0 +1,38 @@ +import tensorflow as tf +from transformers import AutoFeatureExtractor + +# import your TFConvNextForImageClassification class here, we will take care +# of adding the boilerplate to run `from transformers import +# TFConvNextForImageClassification` later +from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification +from transformers import ConvNextForImageClassification + +from PIL import Image + +# model = ConvNextForImageClassification.from_pretrained( +# "facebook/convnext-tiny-224", +# ) +# print(f"Model State Dict:\n") +# all_keys = list(model.state_dict().keys()) +# print([k for k in all_keys if "layer_scale" in k]) + +model = 
TFConvNextForImageClassification.from_pretrained( + "facebook/convnext-tiny-224", + from_pt=True, +) # notice the `from_pt` argument +print(model.summary(expand_nested=True)) + + +feature_extractor = AutoFeatureExtractor.from_pretrained( + "facebook/convnext-tiny-224" +) # don't know if this is supposed to work with TF as well, change this as needed + +image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path +inputs = feature_extractor(images=image, return_tensors="tf") + +# forward pass +outputs = model(**inputs) + +# verify the logits +assert outputs.logits.shape == [1, 1000] +tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From 5e01b71b1543b066491ecc809124ff34cfde9e8a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:19:23 +0530 Subject: [PATCH 28/65] rebasing and removing playground.py. --- playground.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 8a53d5babd2be..0000000000000 --- a/playground.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow as tf -from transformers import AutoFeatureExtractor - -# import your TFConvNextForImageClassification class here, we will take care -# of adding the boilerplate to run `from transformers import -# TFConvNextForImageClassification` later -from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification -from transformers import ConvNextForImageClassification - -from PIL import Image - -# model = ConvNextForImageClassification.from_pretrained( -# "facebook/convnext-tiny-224", -# ) -# print(f"Model State Dict:\n") -# all_keys = list(model.state_dict().keys()) -# print([k for k in all_keys if "layer_scale" in k]) - -model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, -) # notice the 
`from_pt` argument -print(model.summary(expand_nested=True)) - - -feature_extractor = AutoFeatureExtractor.from_pretrained( - "facebook/convnext-tiny-224" -) # don't know if this is supposed to work with TF as well, change this as needed - -image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path -inputs = feature_extractor(images=image, return_tensors="tf") - -# forward pass -outputs = model(**inputs) - -# verify the logits -assert outputs.logits.shape == [1, 1000] -tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From 908d0cf85b6c40c9de7c0dd6c77c65287472c7f2 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 17 Feb 2022 09:49:25 +0530 Subject: [PATCH 29/65] feat: encapsulation for the convnext trunk. --- .../models/convnext/modeling_tf_convnext.py | 176 +++++++++++++----- 1 file changed, 131 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 7b899723c1589..1a37948647a70 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -29,6 +29,7 @@ TFSequenceClassificationLoss, get_initializer, input_processing, + keras_serializable, ) from ...utils import logging from .configuration_convnext import ConvNextConfig @@ -81,12 +82,12 @@ def __init__(self, config, **kwargs): self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") def call(self, pixel_values): - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. - # So change the input format from `NCHW` to `NHWC`. - # shape = (batch_size, in_height, in_width, in_channels=num_channels) if isinstance(pixel_values, dict): pixel_values = pixel_values["pixel_values"] + # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. 
+ # So change the input format from `NCHW` to `NHWC`. + # shape = (batch_size, in_height, in_width, in_channels=num_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) embeddings = self.patch_embeddings(pixel_values) @@ -119,27 +120,35 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs): groups=dim, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", - name=f"{base_name}.dwconv", + name="dwconv", + # name=f"{base_name}.dwconv", ) # depthwise conv - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name=f"{base_name}.layernorm") + self.layernorm = tf.keras.layers.LayerNormalization( + epsilon=1e-6, + name="layernorm", + # name=f"{base_name}.layernorm" + ) self.pwconv1 = tf.keras.layers.Dense( units=4 * dim, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", - name=f"{base_name}.pwconv1", + name="pwconv1", + # name=f"{base_name}.pwconv1", ) # pointwise/1x1 convs, implemented with linear layers self.act = get_tf_activation(config.hidden_act) self.pwconv2 = tf.keras.layers.Dense( units=dim, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", - name=f"{base_name}.pwconv2", + name="pwconv2", + # name=f"{base_name}.pwconv2", ) self.layer_scale_parameter = ( tf.Variable( config.layer_scale_init_value * tf.ones((dim,)), trainable=True, - name=f"{base_name}.layer_scale_parameter", + name="layer_scale_parameter", + # name=f"{base_name}.layer_scale_parameter", ) if config.layer_scale_init_value > 0 else None @@ -147,9 +156,17 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs): # Using `layers.Activation` instead of `tf.identity` to better control `training` # behaviour. 
self.drop_path = ( - TFConvNextDropPath(drop_path, name=f"{base_name}.drop_path") + TFConvNextDropPath( + drop_path, + name="drop_path", + # name=f"{base_name}.drop_path" + ) if drop_path > 0.0 - else tf.keras.layers.Activation("linear", name=f"{base_name}.drop_path") + else tf.keras.layers.Activation( + "linear", + name="drop_path", + # name=f"{base_name}.drop_path" + ) ) def call(self, hidden_states, training=False): @@ -188,7 +205,8 @@ def __init__( [ tf.keras.layers.LayerNormalization( epsilon=1e-6, - name=f"{base_name}/{base_name}.downsampling_layer.0", + name="downsampling_layer.0", + # name=f"{base_name}/{base_name}.downsampling_layer.0", ), tf.keras.layers.Conv2D( filters=out_channels, @@ -196,7 +214,8 @@ def __init__( strides=stride, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", - name=f"{base_name}/{base_name}.downsampling_layer.1", + name="downsampling_layer.1", + # name=f"{base_name}/{base_name}.downsampling_layer.1", ), ], ) @@ -208,7 +227,11 @@ def __init__( [ *[ TFConvNextLayer( - config, dim=out_channels, drop_path=drop_path_rates[j], name=f"{base_name}.layers.{j}" + config, + dim=out_channels, + drop_path=drop_path_rates[j], + name=f"layers.{j}", + # name=f"{base_name}.layers.{j}" ) for j in range(depth) ] @@ -238,7 +261,8 @@ def __init__(self, config, **kwargs): stride=2 if i > 0 else 1, depth=config.depths[i], drop_path_rates=drop_path_rates[cur], - name=f"{base_name}.stages.{i}", + name=f"stages.{i}", + # name=f"{base_name}.stages.{i}", ) self.stages.append(stage) cur += config.depths[i] @@ -265,6 +289,73 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True): ) +@keras_serializable +class TFConvNextMainLayer(tf.keras.layers.Layer): + config_class = ConvNextConfig + + def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs): + super().__init__(**kwargs) + + self.config = config + base_name = kwargs.get("name") + self.embeddings = 
TFConvNextEmbeddings(config, name="embeddings") + self.encoder = TFConvNextEncoder(config, name="encoder") + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None + + def call( + self, + pixel_values: Optional[TFModelInputType] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if "input_ids" in inputs: + inputs["pixel_values"] = inputs.pop("input_ids") + + if inputs["pixel_values"] is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(inputs["pixel_values"], training=inputs["training"]) + + encoder_outputs = self.encoder( + embedding_output, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=inputs["training"], + ) + + last_hidden_state = encoder_outputs[0] + # print(f"From modeling TF: {type(self.pooler), type(self.layernorm)}") + # print(f"From modeling TF: {last_hidden_state}") + pooled_output = self.layernorm(self.pooler(last_hidden_state)) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + class TFConvNextPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights 
initialization and a simple interface for downloading and loading pretrained @@ -354,21 +445,21 @@ def serving(self, inputs): CONVNEXT_START_DOCSTRING, ) class TFConvNextModel(TFConvNextPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs): super().__init__(config, *inputs, **kwargs) - base_name = kwargs.get("name") - self.config = config + # base_name = kwargs.get("name") + # self.config = config + self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext") + # # Observe the name parameter in `encoder`, `embeddings`, and `layernorm` + # # Adding `base_name` to the embeddings and layernorm adds errors. + # self.embeddings = TFConvNextEmbeddings(config, name="embeddings") + # self.encoder = TFConvNextEncoder(config, name=f"{base_name}.encoder") - # Observe the name parameter in `encoder`, `embeddings`, and `layernorm` - # Adding `base_name` to the embeddings and layernorm adds errors. 
- self.embeddings = TFConvNextEmbeddings(config, name="embeddings") - self.encoder = TFConvNextEncoder(config, name=f"{base_name}.encoder") - - # final layernorm layer - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + # # final layernorm layer + # self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") - # global average pooling - self.pooler = tf.keras.layers.GlobalAvgPool2D() + # # global average pooling + # self.pooler = tf.keras.layers.GlobalAvgPool2D() @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) @@ -421,26 +512,13 @@ def call( if inputs["pixel_values"] is None: raise ValueError("You have to specify pixel_values") - embedding_output = self.embeddings(pixel_values) - - encoder_outputs = self.encoder( - embedding_output, + outputs = self.convnext( + pixel_values=inputs["pixel_values"], output_hidden_states=output_hidden_states, return_dict=return_dict, + training=inputs["training"], ) - - last_hidden_state = encoder_outputs[0] - - pooled_output = self.layernorm(self.pooler(last_hidden_state)) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return TFBaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - ) + return outputs @add_start_docstrings( @@ -455,7 +533,7 @@ def __init__(self, config: ConvNextConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.convnext = TFConvNextModel(config, name="convnext") + self.convnext = TFConvNextMainLayer(config, name="convnext") # Classifier head self.classifier = tf.keras.layers.Dense( @@ -505,6 +583,11 @@ def call( >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0] >>> print("Predicted class:", 
model.config.id2label[int(predicted_class_idx)]) ```""" + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + inputs = input_processing( func=self.call, config=self.config, @@ -523,7 +606,10 @@ def call( raise ValueError("You have to specify pixel_values") outputs = self.convnext( - inputs["pixel_values"], output_hidden_states=output_hidden_states, return_dict=return_dict + inputs["pixel_values"], + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=inputs["training"], ) pooled_output = outputs.pooler_output if return_dict else outputs[1] From d386cf884237fe7a215901b4e748cc1daf8d18b2 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Fri, 18 Feb 2022 18:13:55 +0000 Subject: [PATCH 30/65] Fix variable naming; Test-related corrections; Run make fixup --- src/transformers/__init__.py | 10 +- src/transformers/modeling_tf_utils.py | 2 +- src/transformers/models/convnext/__init__.py | 5 +- .../models/convnext/modeling_tf_convnext.py | 131 ++++++++---------- tests/test_modeling_tf_convnext.py | 6 +- 5 files changed, 71 insertions(+), 83 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 22eceb14efabd..3224c282cdf85 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -852,9 +852,6 @@ "ConvNextForImageClassification", "ConvNextModel", "ConvNextPreTrainedModel", - "TFConvNextForImageClassification", - "TFConvNextModel", - "TFConvNextPreTrainedModel", ] ) _import_structure["models.ctrl"].extend( @@ -1717,6 +1714,13 @@ "TFConvBertPreTrainedModel", ] ) + _import_structure["models.convnext"].extend( + [ + "TFConvNextForImageClassification", + "TFConvNextModel", + "TFConvNextPreTrainedModel", + ] + ) _import_structure["models.ctrl"].extend( [ "TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST", diff --git 
a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 85b1accb605df..8ed0a273a518c 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -1829,7 +1829,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range + self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range def build(self, input_shape): """ diff --git a/src/transformers/models/convnext/__init__.py b/src/transformers/models/convnext/__init__.py index 995d38f80998d..a627c462e9ba4 100644 --- a/src/transformers/models/convnext/__init__.py +++ b/src/transformers/models/convnext/__init__.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from ...file_utils import _LazyModule, is_torch_available, is_tf_available, is_vision_available +from ...file_utils import _LazyModule, is_tf_available, is_torch_available, is_vision_available _import_structure = { @@ -57,6 +57,9 @@ ConvNextPreTrainedModel, ) + if is_tf_available(): + from .modeling_convnext import TFConvNextForImageClassification, TFConvNextModel, TFConvNextPreTrainedModel + else: import sys diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 1a37948647a70..2dd5f155da94a 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -69,8 +69,6 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - # note that we do not use the `base_name` here in `patch_embeddings` - # and `layernorm` self.patch_embeddings = tf.keras.layers.Conv2D( filters=config.hidden_sizes[0], 
kernel_size=config.patch_size, @@ -101,8 +99,8 @@ class TFConvNextLayer(tf.keras.layers.Layer): There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C, H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back - The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to - follow NHWC ordering, we can just apply the operations straight-away without the permutation. + The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to follow + NHWC ordering, we can just apply the operations straight-away without the permutation. Args: config ([`ConvNextConfig`]): Model configuration class. @@ -112,7 +110,8 @@ class TFConvNextLayer(tf.keras.layers.Layer): def __init__(self, config, dim, drop_path=0.0, **kwargs): super().__init__(**kwargs) - base_name = kwargs.get("name") + self.dim = dim + self.config = config self.dwconv = tf.keras.layers.Conv2D( filters=dim, kernel_size=7, @@ -121,19 +120,16 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs): kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", name="dwconv", - # name=f"{base_name}.dwconv", ) # depthwise conv self.layernorm = tf.keras.layers.LayerNormalization( epsilon=1e-6, name="layernorm", - # name=f"{base_name}.layernorm" ) self.pwconv1 = tf.keras.layers.Dense( units=4 * dim, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", name="pwconv1", - # name=f"{base_name}.pwconv1", ) # pointwise/1x1 convs, implemented with linear layers self.act = get_tf_activation(config.hidden_act) self.pwconv2 = tf.keras.layers.Dense( @@ -141,17 +137,6 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs): kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", name="pwconv2", - # name=f"{base_name}.pwconv2", - ) - 
self.layer_scale_parameter = ( - tf.Variable( - config.layer_scale_init_value * tf.ones((dim,)), - trainable=True, - name="layer_scale_parameter", - # name=f"{base_name}.layer_scale_parameter", - ) - if config.layer_scale_init_value > 0 - else None ) # Using `layers.Activation` instead of `tf.identity` to better control `training` # behaviour. @@ -159,16 +144,28 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs): TFConvNextDropPath( drop_path, name="drop_path", - # name=f"{base_name}.drop_path" ) if drop_path > 0.0 else tf.keras.layers.Activation( "linear", name="drop_path", - # name=f"{base_name}.drop_path" ) ) + def build(self, input_shape: tf.TensorShape): + # PT's `nn.Parameters` must be mapped to a TF layer weight to inherit the same name hierarchy (and vice-versa) + self.layer_scale_parameter = ( + self.add_weight( + shape=(self.dim,), + initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value), + trainable=True, + name="layer_scale_parameter", + ) + if self.config.layer_scale_init_value > 0 + else None + ) + super().build(input_shape) + def call(self, hidden_states, training=False): input = hidden_states x = self.dwconv(hidden_states) @@ -199,55 +196,46 @@ def __init__( self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs ): super().__init__(**kwargs) - base_name = kwargs.get("name") if in_channels != out_channels or stride > 1: - self.downsampling_layer = tf.keras.Sequential( - [ - tf.keras.layers.LayerNormalization( - epsilon=1e-6, - name="downsampling_layer.0", - # name=f"{base_name}/{base_name}.downsampling_layer.0", - ), - tf.keras.layers.Conv2D( - filters=out_channels, - kernel_size=kernel_size, - strides=stride, - kernel_initializer=get_initializer(config.initializer_range), - bias_initializer="zeros", - name="downsampling_layer.1", - # name=f"{base_name}/{base_name}.downsampling_layer.1", - ), - ], - ) + self.downsampling_layer = [ + 
tf.keras.layers.LayerNormalization( + epsilon=1e-6, + name="downsampling_layer.0", + ), + tf.keras.layers.Conv2D( + filters=out_channels, + kernel_size=kernel_size, + strides=stride, + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer="zeros", + name="downsampling_layer.1", + ), + ] else: - self.downsampling_layer = tf.identity + self.downsampling_layer = [tf.identity] drop_path_rates = drop_path_rates or [0.0] * depth - self.layers = tf.keras.Sequential( - [ - *[ - TFConvNextLayer( - config, - dim=out_channels, - drop_path=drop_path_rates[j], - name=f"layers.{j}", - # name=f"{base_name}.layers.{j}" - ) - for j in range(depth) - ] - ], - ) + self.layers = [ + TFConvNextLayer( + config, + dim=out_channels, + drop_path=drop_path_rates[j], + name=f"layers.{j}", + ) + for j in range(depth) + ] def call(self, hidden_states): - hidden_states = self.downsampling_layer(hidden_states) - hidden_states = self.layers(hidden_states) + for layer in self.downsampling_layer: + hidden_states = layer(hidden_states) + for layer in self.layers: + hidden_states = layer(hidden_states) return hidden_states class TFConvNextEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - base_name = kwargs.get("name") self.stages = [] drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))] cur = 0 @@ -262,7 +250,6 @@ def __init__(self, config, **kwargs): depth=config.depths[i], drop_path_rates=drop_path_rates[cur], name=f"stages.{i}", - # name=f"{base_name}.stages.{i}", ) self.stages.append(stage) cur += config.depths[i] @@ -297,7 +284,6 @@ def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwa super().__init__(**kwargs) self.config = config - base_name = kwargs.get("name") self.embeddings = TFConvNextEmbeddings(config, name="embeddings") self.encoder = TFConvNextEncoder(config, name="encoder") self.layernorm = 
tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") @@ -342,8 +328,6 @@ def call( ) last_hidden_state = encoder_outputs[0] - # print(f"From modeling TF: {type(self.pooler), type(self.layernorm)}") - # print(f"From modeling TF: {last_hidden_state}") pooled_output = self.layernorm(self.pooler(last_hidden_state)) if not return_dict: @@ -447,19 +431,7 @@ def serving(self, inputs): class TFConvNextModel(TFConvNextPreTrainedModel): def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs): super().__init__(config, *inputs, **kwargs) - # base_name = kwargs.get("name") - # self.config = config self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext") - # # Observe the name parameter in `encoder`, `embeddings`, and `layernorm` - # # Adding `base_name` to the embeddings and layernorm adds errors. - # self.embeddings = TFConvNextEmbeddings(config, name="embeddings") - # self.encoder = TFConvNextEncoder(config, name=f"{base_name}.encoder") - - # # final layernorm layer - # self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") - - # # global average pooling - # self.pooler = tf.keras.layers.GlobalAvgPool2D() @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) @@ -518,7 +490,16 @@ def call( return_dict=return_dict, training=inputs["training"], ) - return outputs + + # converts back NHWC -> NCHW, to match PT's output + if not return_dict: + return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)), + pooler_output=outputs.pooler_output, + hidden_states=outputs.hidden_states, + ) @add_start_docstrings( diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py index 
2bb05b1aed953..38665d0625031 100644 --- a/tests/test_modeling_tf_convnext.py +++ b/tests/test_modeling_tf_convnext.py @@ -14,8 +14,8 @@ # limitations under the License. """ Testing suite for the TensorFlow ConvNext model. """ -import unittest import inspect +import unittest from transformers import ConvNextConfig from transformers.file_utils import cached_property, is_tf_available, is_vision_available @@ -96,10 +96,10 @@ def get_config(self): def create_and_check_model(self, config, pixel_values, labels): model = TFConvNextModel(config=config) result = model(pixel_values, training=False) - # expected last hidden states: B, H // 32, W // 32, C + # expected last hidden states: B, C, H // 32, W // 32 self.parent.assertEqual( result.last_hidden_state.shape, - (self.batch_size, self.image_size // 32, self.image_size // 32, self.hidden_sizes[-1]), + (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), ) def create_and_check_for_image_classification(self, config, pixel_values, labels): From 15c916f416dfc5c820528f25c0257544847ceb50 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 21 Feb 2022 16:05:38 +0530 Subject: [PATCH 31/65] chore: added Joao as a contributor to convnext. --- docs/source/model_doc/convnext.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/model_doc/convnext.mdx b/docs/source/model_doc/convnext.mdx index f2e789b36916a..4d46248565f94 100644 --- a/docs/source/model_doc/convnext.mdx +++ b/docs/source/model_doc/convnext.mdx @@ -37,8 +37,8 @@ alt="drawing" width="600"/> ConvNeXT architecture. Taken from the original paper. -This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [ariG23498](https://github.com/ariG23498) -and [sayakpaul](https://github.com/sayakpaul) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt). 
+This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [ariG23498](https://github.com/ariG23498), +[gante](https://github.com/gante), and [sayakpaul](https://github.com/sayakpaul) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt). ## ConvNeXT specific outputs From 05b8273708d5f33ff712a1325591e98bdcd285cc Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:18:41 +0530 Subject: [PATCH 32/65] rebasing --- playground.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 playground.py diff --git a/playground.py b/playground.py new file mode 100644 index 0000000000000..8a53d5babd2be --- /dev/null +++ b/playground.py @@ -0,0 +1,38 @@ +import tensorflow as tf +from transformers import AutoFeatureExtractor + +# import your TFConvNextForImageClassification class here, we will take care +# of adding the boilerplate to run `from transformers import +# TFConvNextForImageClassification` later +from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification +from transformers import ConvNextForImageClassification + +from PIL import Image + +# model = ConvNextForImageClassification.from_pretrained( +# "facebook/convnext-tiny-224", +# ) +# print(f"Model State Dict:\n") +# all_keys = list(model.state_dict().keys()) +# print([k for k in all_keys if "layer_scale" in k]) + +model = TFConvNextForImageClassification.from_pretrained( + "facebook/convnext-tiny-224", + from_pt=True, +) # notice the `from_pt` argument +print(model.summary(expand_nested=True)) + + +feature_extractor = AutoFeatureExtractor.from_pretrained( + "facebook/convnext-tiny-224" +) # don't know if this is supposed to work with TF as well, change this as needed + +image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path +inputs = 
feature_extractor(images=image, return_tensors="tf") + +# forward pass +outputs = model(**inputs) + +# verify the logits +assert outputs.logits.shape == [1, 1000] +tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From d247441d0a5d8e03d3959aed4e38869fbc3fe1f6 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:19:23 +0530 Subject: [PATCH 33/65] rebasing and removing playground.py. --- playground.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 8a53d5babd2be..0000000000000 --- a/playground.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow as tf -from transformers import AutoFeatureExtractor - -# import your TFConvNextForImageClassification class here, we will take care -# of adding the boilerplate to run `from transformers import -# TFConvNextForImageClassification` later -from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification -from transformers import ConvNextForImageClassification - -from PIL import Image - -# model = ConvNextForImageClassification.from_pretrained( -# "facebook/convnext-tiny-224", -# ) -# print(f"Model State Dict:\n") -# all_keys = list(model.state_dict().keys()) -# print([k for k in all_keys if "layer_scale" in k]) - -model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, -) # notice the `from_pt` argument -print(model.summary(expand_nested=True)) - - -feature_extractor = AutoFeatureExtractor.from_pretrained( - "facebook/convnext-tiny-224" -) # don't know if this is supposed to work with TF as well, change this as needed - -image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path -inputs = feature_extractor(images=image, return_tensors="tf") - -# forward pass -outputs = model(**inputs) - -# verify the 
logits -assert outputs.logits.shape == [1, 1000] -tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From bb8e6c208363e837900a525e6921077e1c22ee76 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:18:41 +0530 Subject: [PATCH 34/65] rebasing --- playground.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 playground.py diff --git a/playground.py b/playground.py new file mode 100644 index 0000000000000..8a53d5babd2be --- /dev/null +++ b/playground.py @@ -0,0 +1,38 @@ +import tensorflow as tf +from transformers import AutoFeatureExtractor + +# import your TFConvNextForImageClassification class here, we will take care +# of adding the boilerplate to run `from transformers import +# TFConvNextForImageClassification` later +from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification +from transformers import ConvNextForImageClassification + +from PIL import Image + +# model = ConvNextForImageClassification.from_pretrained( +# "facebook/convnext-tiny-224", +# ) +# print(f"Model State Dict:\n") +# all_keys = list(model.state_dict().keys()) +# print([k for k in all_keys if "layer_scale" in k]) + +model = TFConvNextForImageClassification.from_pretrained( + "facebook/convnext-tiny-224", + from_pt=True, +) # notice the `from_pt` argument +print(model.summary(expand_nested=True)) + + +feature_extractor = AutoFeatureExtractor.from_pretrained( + "facebook/convnext-tiny-224" +) # don't know if this is supposed to work with TF as well, change this as needed + +image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path +inputs = feature_extractor(images=image, return_tensors="tf") + +# forward pass +outputs = model(**inputs) + +# verify the logits +assert outputs.logits.shape == [1, 1000] +tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From 
3b5366d731aaf9048168534894b3d0b83a739ca6 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:19:23 +0530 Subject: [PATCH 35/65] rebasing and removing playground.py. --- playground.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 8a53d5babd2be..0000000000000 --- a/playground.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow as tf -from transformers import AutoFeatureExtractor - -# import your TFConvNextForImageClassification class here, we will take care -# of adding the boilerplate to run `from transformers import -# TFConvNextForImageClassification` later -from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification -from transformers import ConvNextForImageClassification - -from PIL import Image - -# model = ConvNextForImageClassification.from_pretrained( -# "facebook/convnext-tiny-224", -# ) -# print(f"Model State Dict:\n") -# all_keys = list(model.state_dict().keys()) -# print([k for k in all_keys if "layer_scale" in k]) - -model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, -) # notice the `from_pt` argument -print(model.summary(expand_nested=True)) - - -feature_extractor = AutoFeatureExtractor.from_pretrained( - "facebook/convnext-tiny-224" -) # don't know if this is supposed to work with TF as well, change this as needed - -image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path -inputs = feature_extractor(images=image, return_tensors="tf") - -# forward pass -outputs = model(**inputs) - -# verify the logits -assert outputs.logits.shape == [1, 1000] -tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From 49b35cdade8d9349fbd3c8761a2b7658df1b7217 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 21 Feb 2022 20:18:45 
+0530 Subject: [PATCH 36/65] chore: corrected copyright year and added comment on NHWC. --- .../models/convnext/modeling_tf_convnext.py | 132 ++++++++++++++---- tests/test_modeling_tf_convnext.py | 84 ++++++++--- 2 files changed, 174 insertions(+), 42 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 2dd5f155da94a..c003d465e0aeb 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -21,8 +21,16 @@ import tensorflow as tf from ...activations_tf import get_tf_activation -from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings -from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput +from ...file_utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPooling, + TFSequenceClassifierOutput, +) from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, @@ -77,7 +85,9 @@ def __init__(self, config, **kwargs): kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", ) - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") + self.layernorm = tf.keras.layers.LayerNormalization( + epsilon=1e-6, name="layernorm" + ) def call(self, pixel_values): if isinstance(pixel_values, dict): @@ -157,7 +167,9 @@ def build(self, input_shape: tf.TensorShape): self.layer_scale_parameter = ( self.add_weight( shape=(self.dim,), - initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value), + initializer=tf.keras.initializers.Constant( + value=self.config.layer_scale_init_value + ), trainable=True, name="layer_scale_parameter", ) @@ -193,7 +205,15 @@ class 
TFConvNextStage(tf.keras.layers.Layer): """ def __init__( - self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs + self, + config, + in_channels, + out_channels, + kernel_size=2, + stride=2, + depth=2, + drop_path_rates=None, + **kwargs ): super().__init__(**kwargs) if in_channels != out_channels or stride > 1: @@ -202,11 +222,18 @@ def __init__( epsilon=1e-6, name="downsampling_layer.0", ), + # Inputs to this layer will follow NHWC format since we + # transposed the inputs from NCHW to NHWC in the `TFConvNextEmbeddings` + # layer. All the outputs throughout the model will be in NHWC + # from this point on until the output where we again change to + # NCHW. tf.keras.layers.Conv2D( filters=out_channels, kernel_size=kernel_size, strides=stride, - kernel_initializer=get_initializer(config.initializer_range), + kernel_initializer=get_initializer( + config.initializer_range + ), bias_initializer="zeros", name="downsampling_layer.1", ), @@ -237,7 +264,10 @@ class TFConvNextEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.stages = [] - drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))] + drop_path_rates = [ + x + for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths)) + ] cur = 0 prev_chs = config.hidden_sizes[0] for i in range(config.num_stages): @@ -268,7 +298,9 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True): all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) + return tuple( + v for v in [hidden_states, all_hidden_states] if v is not None + ) return TFBaseModelOutput( last_hidden_state=hidden_states, @@ -280,14 +312,20 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True): class TFConvNextMainLayer(tf.keras.layers.Layer): config_class = ConvNextConfig - 
def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs): + def __init__( + self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs + ): super().__init__(**kwargs) self.config = config self.embeddings = TFConvNextEmbeddings(config, name="embeddings") self.encoder = TFConvNextEncoder(config, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") - self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None + self.layernorm = tf.keras.layers.LayerNormalization( + epsilon=config.layer_norm_eps, name="layernorm" + ) + self.pooler = ( + tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None + ) def call( self, @@ -298,9 +336,15 @@ def call( **kwargs, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict + if return_dict is not None + else self.config.use_return_dict ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict inputs = input_processing( func=self.call, @@ -318,7 +362,9 @@ def call( if inputs["pixel_values"] is None: raise ValueError("You have to specify pixel_values") - embedding_output = self.embeddings(inputs["pixel_values"], training=inputs["training"]) + embedding_output = self.embeddings( + inputs["pixel_values"], training=inputs["training"] + ) encoder_outputs = self.encoder( embedding_output, @@ -359,14 +405,22 @@ def dummy_inputs(self) -> Dict[str, tf.Tensor]: `Dict[str, tf.Tensor]`: The dummy inputs. 
""" VISION_DUMMY_INPUTS = tf.random.uniform( - shape=(3, self.config.num_channels, self.config.image_size, self.config.image_size), dtype=tf.float32 + shape=( + 3, + self.config.num_channels, + self.config.image_size, + self.config.image_size, + ), + dtype=tf.float32, ) return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} @tf.function( input_signature=[ { - "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), + "pixel_values": tf.TensorSpec( + (None, None, None, None), tf.float32, name="pixel_values" + ), } ] ) @@ -431,10 +485,14 @@ def serving(self, inputs): class TFConvNextModel(TFConvNextPreTrainedModel): def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs): super().__init__(config, *inputs, **kwargs) - self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext") + self.convnext = TFConvNextMainLayer( + config, add_pooling_layer=add_pooling_layer, name="convnext" + ) @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings( + output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC + ) def call( self, pixel_values: Optional[TFModelInputType] = None, @@ -464,9 +522,15 @@ def call( >>> last_hidden_states = outputs.last_hidden_state ```""" output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict + if return_dict is not None + else self.config.use_return_dict ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict inputs = input_processing( func=self.call, @@ -496,7 +560,9 @@ def call( return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:] return TFBaseModelOutputWithPooling( - 
last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)), + last_hidden_state=tf.transpose( + outputs.last_hidden_state, perm=(0, 3, 1, 2) + ), pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, ) @@ -509,7 +575,9 @@ def call( """, CONVNEXT_START_DOCSTRING, ) -class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss): +class TFConvNextForImageClassification( + TFConvNextPreTrainedModel, TFSequenceClassificationLoss +): def __init__(self, config: ConvNextConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -525,7 +593,9 @@ def __init__(self, config: ConvNextConfig, *inputs, **kwargs): ) @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings( + output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC + ) def call( self, pixel_values: Optional[TFModelInputType] = None, @@ -565,9 +635,15 @@ def call( >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)]) ```""" output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict + if return_dict is not None + else self.config.use_return_dict ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict inputs = input_processing( func=self.call, @@ -596,7 +672,11 @@ def call( pooled_output = outputs.pooler_output if return_dict else outputs[1] logits = self.classifier(pooled_output) - loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits) + loss = ( + None + if inputs["labels"] is None + else self.hf_compute_loss(labels=inputs["labels"], logits=logits) + ) if not 
inputs["return_dict"]: output = (logits,) + outputs[2:] diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py index 38665d0625031..ed597f92657ed 100644 --- a/tests/test_modeling_tf_convnext.py +++ b/tests/test_modeling_tf_convnext.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,11 +18,19 @@ import unittest from transformers import ConvNextConfig -from transformers.file_utils import cached_property, is_tf_available, is_vision_available +from transformers.file_utils import ( + cached_property, + is_tf_available, + is_vision_available, +) from transformers.testing_utils import require_tf, require_vision, slow from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor +from .test_modeling_tf_common import ( + TFModelTesterMixin, + floats_tensor, + ids_tensor, +) if is_tf_available(): @@ -72,11 +80,20 @@ def __init__( self.scope = scope def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + pixel_values = floats_tensor( + [ + self.batch_size, + self.num_channels, + self.image_size, + self.image_size, + ] + ) labels = None if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + labels = ids_tensor( + [self.batch_size], self.type_sequence_label_size + ) config = self.get_config() @@ -99,14 +116,24 @@ def create_and_check_model(self, config, pixel_values, labels): # expected last hidden states: B, C, H // 32, W // 32 self.parent.assertEqual( result.last_hidden_state.shape, - (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), + ( + self.batch_size, + 
self.hidden_sizes[-1], + self.image_size // 32, + self.image_size // 32, + ), ) - def create_and_check_for_image_classification(self, config, pixel_values, labels): + def create_and_check_for_image_classification( + self, config, pixel_values, labels + ): config.num_labels = self.type_sequence_label_size model = TFConvNextForImageClassification(config) result = model(pixel_values, labels=labels, training=False) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + self.parent.assertEqual( + result.logits.shape, + (self.batch_size, self.type_sequence_label_size), + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -138,13 +165,20 @@ class TFConvNextModelTest(TFModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = TFConvNextModelTester(self) - self.config_tester = ConfigTester(self, config_class=ConvNextConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester( + self, + config_class=ConvNextConfig, + has_text_modality=False, + hidden_size=37, + ) @unittest.skip(reason="ConvNext does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip(reason="ConvNext does not support input and output embeddings") + @unittest.skip( + reason="ConvNext does not support input and output embeddings" + ) def test_model_common_attributes(self): pass @@ -173,7 +207,11 @@ def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + hidden_states = ( + outputs.encoder_hidden_states + if config.is_encoder_decoder + else outputs.hidden_states + ) expected_num_stages = self.model_tester.num_stages self.assertEqual(len(hidden_states), expected_num_stages + 1) @@ -181,10 +219,16 @@ def 
check_hidden_states_output(inputs_dict, config, model_class): # ConvNext's feature maps are of shape (batch_size, height, width, num_channels) in TF self.assertListEqual( list(hidden_states[0].shape[1:-1]), - [self.model_tester.image_size // 4, self.model_tester.image_size // 4], + [ + self.model_tester.image_size // 4, + self.model_tester.image_size // 4, + ], ) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True @@ -198,7 +242,9 @@ def check_hidden_states_output(inputs_dict, config, model_class): def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + self.model_tester.create_and_check_for_image_classification( + *config_and_inputs + ) @slow def test_model_from_pretrained(self): @@ -218,7 +264,11 @@ class TFConvNextModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None + ConvNextFeatureExtractor.from_pretrained( + "facebook/convnext-tiny-224" + ) + if is_vision_available() + else None ) @slow @@ -241,4 +291,6 @@ def test_inference_image_classification_head(self): expected_slice = tf.constant([-0.0260, -0.4739, 0.1911]) - tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4) + tf.debugging.assert_near( + outputs.logits[0, :3], expected_slice, atol=1e-4 + ) From d9b507935a98db016c77e3b094d55ee20f6b70bd Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 21 Feb 2022 20:59:48 +0530 Subject: [PATCH 37/65] chore: fixed the black version and ran formatting. 
--- .../models/convnext/modeling_tf_convnext.py | 107 ++++-------------- tests/test_modeling_tf_convnext.py | 32 ++---- 2 files changed, 30 insertions(+), 109 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index c003d465e0aeb..950891db17730 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -85,9 +85,7 @@ def __init__(self, config, **kwargs): kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", ) - self.layernorm = tf.keras.layers.LayerNormalization( - epsilon=1e-6, name="layernorm" - ) + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") def call(self, pixel_values): if isinstance(pixel_values, dict): @@ -167,9 +165,7 @@ def build(self, input_shape: tf.TensorShape): self.layer_scale_parameter = ( self.add_weight( shape=(self.dim,), - initializer=tf.keras.initializers.Constant( - value=self.config.layer_scale_init_value - ), + initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value), trainable=True, name="layer_scale_parameter", ) @@ -205,15 +201,7 @@ class TFConvNextStage(tf.keras.layers.Layer): """ def __init__( - self, - config, - in_channels, - out_channels, - kernel_size=2, - stride=2, - depth=2, - drop_path_rates=None, - **kwargs + self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs ): super().__init__(**kwargs) if in_channels != out_channels or stride > 1: @@ -231,9 +219,7 @@ def __init__( filters=out_channels, kernel_size=kernel_size, strides=stride, - kernel_initializer=get_initializer( - config.initializer_range - ), + kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", name="downsampling_layer.1", ), @@ -264,10 +250,7 @@ class TFConvNextEncoder(tf.keras.layers.Layer): def __init__(self, config, 
**kwargs): super().__init__(**kwargs) self.stages = [] - drop_path_rates = [ - x - for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths)) - ] + drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))] cur = 0 prev_chs = config.hidden_sizes[0] for i in range(config.num_stages): @@ -298,9 +281,7 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True): all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple( - v for v in [hidden_states, all_hidden_states] if v is not None - ) + return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) return TFBaseModelOutput( last_hidden_state=hidden_states, @@ -312,20 +293,14 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True): class TFConvNextMainLayer(tf.keras.layers.Layer): config_class = ConvNextConfig - def __init__( - self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs - ): + def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs): super().__init__(**kwargs) self.config = config self.embeddings = TFConvNextEmbeddings(config, name="embeddings") self.encoder = TFConvNextEncoder(config, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm" - ) - self.pooler = ( - tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None - ) + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None def call( self, @@ -336,15 +311,9 @@ def call( **kwargs, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict - if return_dict is not None - else self.config.use_return_dict + 
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict inputs = input_processing( func=self.call, @@ -362,9 +331,7 @@ def call( if inputs["pixel_values"] is None: raise ValueError("You have to specify pixel_values") - embedding_output = self.embeddings( - inputs["pixel_values"], training=inputs["training"] - ) + embedding_output = self.embeddings(inputs["pixel_values"], training=inputs["training"]) encoder_outputs = self.encoder( embedding_output, @@ -418,9 +385,7 @@ def dummy_inputs(self) -> Dict[str, tf.Tensor]: @tf.function( input_signature=[ { - "pixel_values": tf.TensorSpec( - (None, None, None, None), tf.float32, name="pixel_values" - ), + "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), } ] ) @@ -485,14 +450,10 @@ def serving(self, inputs): class TFConvNextModel(TFConvNextPreTrainedModel): def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs): super().__init__(config, *inputs, **kwargs) - self.convnext = TFConvNextMainLayer( - config, add_pooling_layer=add_pooling_layer, name="convnext" - ) + self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext") @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC - ) + @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) def call( self, pixel_values: Optional[TFModelInputType] = None, @@ -522,15 +483,9 @@ def call( >>> last_hidden_states = outputs.last_hidden_state ```""" output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict - if return_dict is not None - else self.config.use_return_dict + output_hidden_states if 
output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict inputs = input_processing( func=self.call, @@ -560,9 +515,7 @@ def call( return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:] return TFBaseModelOutputWithPooling( - last_hidden_state=tf.transpose( - outputs.last_hidden_state, perm=(0, 3, 1, 2) - ), + last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)), pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, ) @@ -575,9 +528,7 @@ def call( """, CONVNEXT_START_DOCSTRING, ) -class TFConvNextForImageClassification( - TFConvNextPreTrainedModel, TFSequenceClassificationLoss -): +class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss): def __init__(self, config: ConvNextConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -593,9 +544,7 @@ def __init__(self, config: ConvNextConfig, *inputs, **kwargs): ) @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC - ) + @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def call( self, pixel_values: Optional[TFModelInputType] = None, @@ -635,15 +584,9 @@ def call( >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)]) ```""" output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict - if return_dict is not None - else self.config.use_return_dict + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict inputs = input_processing( func=self.call, @@ -672,11 +615,7 @@ def call( pooled_output = 
outputs.pooler_output if return_dict else outputs[1] logits = self.classifier(pooled_output) - loss = ( - None - if inputs["labels"] is None - else self.hf_compute_loss(labels=inputs["labels"], logits=logits) - ) + loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py index ed597f92657ed..df7c6278d8038 100644 --- a/tests/test_modeling_tf_convnext.py +++ b/tests/test_modeling_tf_convnext.py @@ -91,9 +91,7 @@ def prepare_config_and_inputs(self): labels = None if self.use_labels: - labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size - ) + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) config = self.get_config() @@ -124,9 +122,7 @@ def create_and_check_model(self, config, pixel_values, labels): ), ) - def create_and_check_for_image_classification( - self, config, pixel_values, labels - ): + def create_and_check_for_image_classification(self, config, pixel_values, labels): config.num_labels = self.type_sequence_label_size model = TFConvNextForImageClassification(config) result = model(pixel_values, labels=labels, training=False) @@ -176,9 +172,7 @@ def setUp(self): def test_inputs_embeds(self): pass - @unittest.skip( - reason="ConvNext does not support input and output embeddings" - ) + @unittest.skip(reason="ConvNext does not support input and output embeddings") def test_model_common_attributes(self): pass @@ -207,11 +201,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - hidden_states = ( - outputs.encoder_hidden_states - if config.is_encoder_decoder - else outputs.hidden_states - ) + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states expected_num_stages = 
self.model_tester.num_stages self.assertEqual(len(hidden_states), expected_num_stages + 1) @@ -242,9 +232,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification( - *config_and_inputs - ) + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) @slow def test_model_from_pretrained(self): @@ -264,11 +252,7 @@ class TFConvNextModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - ConvNextFeatureExtractor.from_pretrained( - "facebook/convnext-tiny-224" - ) - if is_vision_available() - else None + ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None ) @slow @@ -291,6 +275,4 @@ def test_inference_image_classification_head(self): expected_slice = tf.constant([-0.0260, -0.4739, 0.1911]) - tf.debugging.assert_near( - outputs.logits[0, :3], expected_slice, atol=1e-4 - ) + tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4) From 4b4737f53693f476352a8848de7683138c3ff8aa Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 21 Feb 2022 22:46:29 +0530 Subject: [PATCH 38/65] chore: ran make style. 
--- .../models/convnext/modeling_tf_convnext.py | 12 ++---------- tests/test_modeling_tf_convnext.py | 12 ++---------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 950891db17730..328194dddbc2c 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -21,16 +21,8 @@ import tensorflow as tf from ...activations_tf import get_tf_activation -from ...file_utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings, -) -from ...modeling_tf_outputs import ( - TFBaseModelOutput, - TFBaseModelOutputWithPooling, - TFSequenceClassifierOutput, -) +from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py index df7c6278d8038..52cbf02f5a549 100644 --- a/tests/test_modeling_tf_convnext.py +++ b/tests/test_modeling_tf_convnext.py @@ -18,19 +18,11 @@ import unittest from transformers import ConvNextConfig -from transformers.file_utils import ( - cached_property, - is_tf_available, - is_vision_available, -) +from transformers.file_utils import cached_property, is_tf_available, is_vision_available from transformers.testing_utils import require_tf, require_vision, slow from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import ( - TFModelTesterMixin, - floats_tensor, - ids_tensor, -) +from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor if is_tf_available(): From 2322a5f4f660c82fd3a1afe8fd6fdb7e44ab5c2f Mon Sep 17 00:00:00 2001 From: Sayak Paul 
Date: Tue, 22 Feb 2022 10:27:25 +0530 Subject: [PATCH 39/65] chore: removed from_pt argument from test, ran make style. --- tests/test_modeling_tf_convnext.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py index 52cbf02f5a549..233ec6662b820 100644 --- a/tests/test_modeling_tf_convnext.py +++ b/tests/test_modeling_tf_convnext.py @@ -249,10 +249,7 @@ def default_feature_extractor(self): @slow def test_inference_image_classification_head(self): - model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, - ) + model = TFConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224") feature_extractor = self.default_feature_extractor image = prepare_img() From 61ae121e451d52e72d66adaf511d65af835c4c8e Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:18:41 +0530 Subject: [PATCH 40/65] rebasing --- playground.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 playground.py diff --git a/playground.py b/playground.py new file mode 100644 index 0000000000000..8a53d5babd2be --- /dev/null +++ b/playground.py @@ -0,0 +1,38 @@ +import tensorflow as tf +from transformers import AutoFeatureExtractor + +# import your TFConvNextForImageClassification class here, we will take care +# of adding the boilerplate to run `from transformers import +# TFConvNextForImageClassification` later +from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification +from transformers import ConvNextForImageClassification + +from PIL import Image + +# model = ConvNextForImageClassification.from_pretrained( +# "facebook/convnext-tiny-224", +# ) +# print(f"Model State Dict:\n") +# all_keys = list(model.state_dict().keys()) +# print([k for k in all_keys if "layer_scale" in k]) + +model = TFConvNextForImageClassification.from_pretrained( + 
"facebook/convnext-tiny-224", + from_pt=True, +) # notice the `from_pt` argument +print(model.summary(expand_nested=True)) + + +feature_extractor = AutoFeatureExtractor.from_pretrained( + "facebook/convnext-tiny-224" +) # don't know if this is supposed to work with TF as well, change this as needed + +image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path +inputs = feature_extractor(images=image, return_tensors="tf") + +# forward pass +outputs = model(**inputs) + +# verify the logits +assert outputs.logits.shape == [1, 1000] +tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From b5683772e3ec8ac03cd23e50ffa17ba18f983b58 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:19:23 +0530 Subject: [PATCH 41/65] rebasing and removing playground.py. --- playground.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 8a53d5babd2be..0000000000000 --- a/playground.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow as tf -from transformers import AutoFeatureExtractor - -# import your TFConvNextForImageClassification class here, we will take care -# of adding the boilerplate to run `from transformers import -# TFConvNextForImageClassification` later -from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification -from transformers import ConvNextForImageClassification - -from PIL import Image - -# model = ConvNextForImageClassification.from_pretrained( -# "facebook/convnext-tiny-224", -# ) -# print(f"Model State Dict:\n") -# all_keys = list(model.state_dict().keys()) -# print([k for k in all_keys if "layer_scale" in k]) - -model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, -) # notice the `from_pt` argument 
-print(model.summary(expand_nested=True)) - - -feature_extractor = AutoFeatureExtractor.from_pretrained( - "facebook/convnext-tiny-224" -) # don't know if this is supposed to work with TF as well, change this as needed - -image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path -inputs = feature_extractor(images=image, return_tensors="tf") - -# forward pass -outputs = model(**inputs) - -# verify the logits -assert outputs.logits.shape == [1, 1000] -tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From 1259bf8b37e08ab0f05f46991c6e0571a761027f Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:18:41 +0530 Subject: [PATCH 42/65] rebasing --- playground.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 playground.py diff --git a/playground.py b/playground.py new file mode 100644 index 0000000000000..8a53d5babd2be --- /dev/null +++ b/playground.py @@ -0,0 +1,38 @@ +import tensorflow as tf +from transformers import AutoFeatureExtractor + +# import your TFConvNextForImageClassification class here, we will take care +# of adding the boilerplate to run `from transformers import +# TFConvNextForImageClassification` later +from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification +from transformers import ConvNextForImageClassification + +from PIL import Image + +# model = ConvNextForImageClassification.from_pretrained( +# "facebook/convnext-tiny-224", +# ) +# print(f"Model State Dict:\n") +# all_keys = list(model.state_dict().keys()) +# print([k for k in all_keys if "layer_scale" in k]) + +model = TFConvNextForImageClassification.from_pretrained( + "facebook/convnext-tiny-224", + from_pt=True, +) # notice the `from_pt` argument +print(model.summary(expand_nested=True)) + + +feature_extractor = AutoFeatureExtractor.from_pretrained( + "facebook/convnext-tiny-224" +) # don't 
know if this is supposed to work with TF as well, change this as needed + +image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path +inputs = feature_extractor(images=image, return_tensors="tf") + +# forward pass +outputs = model(**inputs) + +# verify the logits +assert outputs.logits.shape == [1, 1000] +tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From 96c1ea4e9a7249bf0a1e8cbf6d682e1c82e9cd24 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:19:23 +0530 Subject: [PATCH 43/65] rebasing and removing playground.py. --- playground.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 8a53d5babd2be..0000000000000 --- a/playground.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow as tf -from transformers import AutoFeatureExtractor - -# import your TFConvNextForImageClassification class here, we will take care -# of adding the boilerplate to run `from transformers import -# TFConvNextForImageClassification` later -from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification -from transformers import ConvNextForImageClassification - -from PIL import Image - -# model = ConvNextForImageClassification.from_pretrained( -# "facebook/convnext-tiny-224", -# ) -# print(f"Model State Dict:\n") -# all_keys = list(model.state_dict().keys()) -# print([k for k in all_keys if "layer_scale" in k]) - -model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, -) # notice the `from_pt` argument -print(model.summary(expand_nested=True)) - - -feature_extractor = AutoFeatureExtractor.from_pretrained( - "facebook/convnext-tiny-224" -) # don't know if this is supposed to work with TF as well, change this as needed - -image = 
Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path -inputs = feature_extractor(images=image, return_tensors="tf") - -# forward pass -outputs = model(**inputs) - -# verify the logits -assert outputs.logits.shape == [1, 1000] -tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From b1972164638903e57f0a8343d04924753e1ecf6c Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 24 Feb 2022 12:22:50 +0530 Subject: [PATCH 44/65] fix: tests in the convnext subclass, ran make style. --- tests/test_modeling_tf_common.py | 419 +++++++++++++++++++++++------ tests/test_modeling_tf_convnext.py | 45 ++++ 2 files changed, 378 insertions(+), 86 deletions(-) diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index e072b4febd90b..2038f29e56cf8 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -83,7 +83,8 @@ # Restrict TensorFlow to only allocate x GB of memory on the GPUs try: tf.config.set_logical_device_configuration( - gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)] + gpu, + [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)], ) logical_gpus = tf.config.list_logical_devices("GPU") print("Logical GPUs", logical_gpus) @@ -116,7 +117,10 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict = { - k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1)) + k: tf.tile( + tf.expand_dims(v, 1), + (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1), + ) if isinstance(v, tf.Tensor) and v.ndim > 0 else v for k, v in inputs_dict.items() @@ -144,7 +148,11 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING), ]: inputs_dict["labels"] = tf.zeros( - 
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32 + ( + self.model_tester.batch_size, + self.model_tester.seq_length, + ), + dtype=tf.int32, ) return inputs_dict @@ -152,7 +160,10 @@ def test_initialization(self): pass def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -166,7 +177,10 @@ def test_save_load(self): self.assert_outputs_same(after_outputs, outputs) def test_save_load_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -218,7 +232,10 @@ def test_onnx_compliancy(self): if not self.test_onnx: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() INTERNAL_OPS = [ "Assert", "AssignVariableOp", @@ -265,7 +282,10 @@ def test_onnx_runtime_optimize(self): import onnxruntime import tf2onnx - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -276,7 +296,10 @@ def test_onnx_runtime_optimize(self): onnxruntime.InferenceSession(onnx_model_proto.SerializeToString()) def test_keras_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() tf_main_layer_classes = set( module_member @@ -321,7 +344,8 @@ def test_keras_save_load(self): ) else: model = tf.keras.models.load_model( 
- filepath, custom_objects={main_layer_class.__name__: main_layer_class} + filepath, + custom_objects={main_layer_class.__name__: main_layer_class}, ) assert isinstance(model, tf.keras.Model) after_outputs = model(inputs_dict) @@ -348,7 +372,10 @@ def test_pt_tf_model_equivalence(self): import transformers - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning @@ -361,7 +388,9 @@ def test_pt_tf_model_equivalence(self): # Check we can load pt model in tf and vice-versa with model => model functions tf_model = transformers.load_pytorch_model_in_tf2_model( - tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class) + tf_model, + pt_model, + tf_inputs=self._prepare_for_class(inputs_dict, model_class), ) pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) @@ -382,7 +411,10 @@ def test_pt_tf_model_equivalence(self): with torch.no_grad(): pto = pt_model(**pt_inputs_dict) - tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False) + tfo = tf_model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + ) tf_hidden_states = tfo[0].numpy() pt_hidden_states = pto[0].numpy() @@ -441,14 +473,20 @@ def test_pt_tf_model_equivalence(self): self.assertLessEqual(max_diff, 4e-2) def test_compile_tf_model(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() max_input = getattr(self.model_tester, "max_position_embeddings", 512) optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metric = 
tf.keras.metrics.SparseCategoricalAccuracy("accuracy") for model_class in self.all_model_classes: - if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]: + if model_class.__name__ in [ + "TFSpeech2TextModel", + "TFSpeech2TextForConditionalGeneration", + ]: inputs = { "decoder_input_ids": tf.keras.Input( batch_shape=(2, max_input), @@ -472,7 +510,11 @@ def test_compile_tf_model(self): name="decoder_input_ids", dtype="int32", ), - "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"), + "input_ids": tf.keras.Input( + batch_shape=(2, max_input), + name="input_ids", + dtype="int32", + ), } # `pixel_values` implies that the input is an image elif model_class.main_input_name == "pixel_values": @@ -488,7 +530,11 @@ def test_compile_tf_model(self): ) elif model_class.__name__ in ["TFCLIPModel"]: inputs = { - "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"), + "input_ids": tf.keras.Input( + batch_shape=(3, max_input), + name="input_ids", + dtype="int32", + ), "pixel_values": tf.keras.Input( batch_shape=( 3, @@ -501,7 +547,11 @@ def test_compile_tf_model(self): ), } elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32") + inputs = tf.keras.Input( + batch_shape=(4, 2, max_input), + name="input_ids", + dtype="int32", + ) else: inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") @@ -524,7 +574,10 @@ def test_compile_tf_model(self): extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) def test_keyword_and_dict_args(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -540,10 +593,21 @@ def 
test_keyword_and_dict_args(self): self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + decoder_seq_length = getattr( + self.model_tester, + "decoder_seq_length", + self.model_tester.seq_length, + ) + encoder_seq_length = getattr( + self.model_tester, + "encoder_seq_length", + self.model_tester.seq_length, + ) decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) @@ -554,7 +618,11 @@ def check_decoder_attentions_output(outputs): self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + decoder_key_length, + ], ) def check_encoder_attentions_output(outputs): @@ -564,7 +632,11 @@ def check_encoder_attentions_output(outputs): self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + [ + self.model_tester.num_attention_heads, + encoder_seq_length, + encoder_key_length, + ], ) for model_class in self.all_model_classes: @@ -606,7 +678,10 @@ def test_headmasking(self): return random.Random().seed(42) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = 
self.model_tester.prepare_config_and_inputs_for_common() random.Random().seed() inputs_dict["output_attentions"] = True @@ -619,11 +694,19 @@ def test_headmasking(self): def prepare_layer_head_mask(i, attention_heads, num_hidden_layers): if i == 0: return tf.concat( - (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0 + ( + tf.zeros(1, dtype=tf.float32), + tf.ones(attention_heads - 1, dtype=tf.float32), + ), + 0, ) elif i == num_hidden_layers - 1: return tf.concat( - (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0 + ( + tf.zeros(attention_heads - 1, dtype=tf.float32), + tf.ones(1, dtype=tf.float32), + ), + 0, ) else: return tf.ones(attention_heads, dtype=tf.float32) @@ -652,7 +735,8 @@ def check_attentions_validity(attentions): # Remove Nan for t in attentions: self.assertLess( - (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy() + (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), + (tf.size(t) / 4).numpy(), ) # Check we don't have more than 25% nans (arbitrary) attentions = [ @@ -660,11 +744,23 @@ def check_attentions_validity(attentions): ] # remove them (the test is less complete) self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0) - self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0) + self.assertNotEqual( + tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), + 0.0, + ) if len(attentions) > 2: # encoder-decodere models have only 2 layers in each modules - self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0) - self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0) - self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0) + self.assertNotEqual( + tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), + 0.0, + ) + self.assertAlmostEqual( + tf.math.reduce_sum(attentions[-1][..., 
-2, :, :]).numpy(), + 0.0, + ) + self.assertNotEqual( + tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), + 0.0, + ) if model.config.is_encoder_decoder: check_attentions_validity(outputs.encoder_attentions) @@ -675,13 +771,18 @@ def check_attentions_validity(attentions): check_attentions_validity(outputs.attentions) def test_hidden_states_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() def check_hidden_states_output(config, inputs_dict, model_class): model = model_class(config) outputs = model(self._prepare_for_class(inputs_dict, model_class)) expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + self.model_tester, + "expected_num_hidden_layers", + self.model_tester.num_hidden_layers + 1, ) if model.config.is_encoder_decoder: @@ -692,12 +793,18 @@ def check_hidden_states_output(config, inputs_dict, model_class): self.assertEqual(len(encoder_hidden_states), expected_num_layers) self.assertListEqual( list(encoder_hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size], + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], ) self.assertEqual(len(decoder_hidden_states), expected_num_layers) self.assertListEqual( list(decoder_hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size], + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], ) else: hidden_states = outputs.hidden_states @@ -705,7 +812,10 @@ def check_hidden_states_output(config, inputs_dict, model_class): self.assertEqual(len(hidden_states), expected_num_layers) self.assertListEqual( list(hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size], + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], ) for model_class in 
self.all_model_classes: @@ -717,7 +827,10 @@ def check_hidden_states_output(config, inputs_dict, model_class): check_hidden_states_output(config, inputs_dict, model_class) def test_model_common_attributes(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() text_in_text_out_models = ( get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) @@ -747,13 +860,22 @@ def test_model_common_attributes(self): assert name is None def test_determinism(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) first, second = ( - model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], - model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], + model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + )[0], + model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + )[0], ) out_1 = first.numpy() out_2 = second.numpy() @@ -764,7 +886,10 @@ def test_determinism(self): def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) @@ -799,30 +924,32 @@ def recursive_check(tuple_object, dict_object): dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - # Pure conv models (such as ConvNeXt) don't have `output_attentions`. 
- if config.output_attentions: - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - if config.output_attentions: - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - if config.output_attentions: - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} - ) + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, + tuple_inputs, + dict_inputs, + {"output_hidden_states": True, "output_attentions": True}, + ) def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + 
config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -849,7 +976,10 @@ def test_inputs_embeds(self): model(inputs) def test_numpy_arrays_inputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() def prepare_numpy_arrays(inputs_dict): inputs_np_dict = {} @@ -874,7 +1004,10 @@ def prepare_numpy_arrays(inputs_dict): def test_resize_token_embeddings(self): if not self.test_resize_embeddings: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() def _get_word_embedding_weight(model, embedding_layer): embeds = getattr(embedding_layer, "weight", None) @@ -933,16 +1066,25 @@ def _get_word_embedding_weight(model, embedding_layer): if old_output_embeddings is not None and new_output_embeddings is not None: self.assertEqual(new_output_embeddings.shape[0], assert_size) - self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1]) + self.assertEqual( + new_output_embeddings.shape[1], + old_output_embeddings.shape[1], + ) models_equal = True - for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): + for p1, p2 in zip( + old_output_embeddings.value(), + new_output_embeddings.value(), + ): if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: models_equal = False self.assertTrue(models_equal) def test_lm_head_model_random_no_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) # iterate over all generative models @@ -969,16 +1111,25 @@ def 
test_lm_head_model_random_no_beam_search_generate(self): # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + bad_words_ids = [ + self._generate_random_bad_tokens(1, model), + self._generate_random_bad_tokens(2, model), + ] output_tokens = model.generate( - input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 + input_ids, + do_sample=True, + bad_words_ids=bad_words_ids, + num_return_sequences=2, ) # only count generated tokens generated_ids = output_tokens[:, input_ids.shape[-1] :] self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) def test_lm_head_model_no_beam_search_generate_dict_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) if input_ids is None: input_ids = inputs_dict.get("input_features", None) @@ -1011,7 +1162,10 @@ def test_lm_head_model_no_beam_search_generate_dict_outputs(self): self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput) def test_lm_head_model_random_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) for model_class in self.all_generative_model_classes: @@ -1026,7 +1180,12 @@ def test_lm_head_model_random_beam_search_generate(self): with self.assertRaises(AssertionError): # generating more sequences than having beams leads is not possible - model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2) + model.generate( + input_ids, + do_sample=False, + num_return_sequences=3, + num_beams=2, + ) # 
num_return_sequences > 1, sample self._check_generated_ids( @@ -1038,20 +1197,37 @@ def test_lm_head_model_random_beam_search_generate(self): ) ) # num_return_sequences > 1, greedy - self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2)) + self._check_generated_ids( + model.generate( + input_ids, + do_sample=False, + num_beams=2, + num_return_sequences=2, + ) + ) # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + bad_words_ids = [ + self._generate_random_bad_tokens(1, model), + self._generate_random_bad_tokens(2, model), + ] output_tokens = model.generate( - input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 + input_ids, + do_sample=False, + bad_words_ids=bad_words_ids, + num_beams=2, + num_return_sequences=2, ) # only count generated tokens generated_ids = output_tokens[:, input_ids.shape[-1] :] self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) def test_lm_head_model_beam_search_generate_dict_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) if input_ids is None: input_ids = inputs_dict.get("input_features", None) @@ -1086,14 +1262,20 @@ def test_lm_head_model_beam_search_generate_dict_outputs(self): self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput) def test_loss_computation(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) if getattr(model, "hf_compute_loss", None): 
# The number of elements in the loss should be the same as the number of elements in the label prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) added_label = prepared_for_class[ - sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] + sorted( + list(prepared_for_class.keys() - inputs_dict.keys()), + reverse=True, + )[0] ] loss_size = tf.size(added_label) @@ -1104,7 +1286,11 @@ def test_loss_computation(self): # Test that model correctly compute the loss with kwargs prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - possible_input_names = {"input_ids", "pixel_values", "input_features"} + possible_input_names = { + "input_ids", + "pixel_values", + "input_features", + } input_name = possible_input_names.intersection(set(prepared_for_class)).pop() model_input = prepared_for_class.pop(input_name) @@ -1148,8 +1334,15 @@ def test_loss_computation(self): self.assertEqual(loss.shape, [loss_size]) def test_generate_with_headmasking(self): - attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + attention_names = [ + "encoder_attentions", + "decoder_attentions", + "cross_attentions", + ] + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_generative_model_classes: model = model_class(config) @@ -1184,7 +1377,10 @@ def test_generate_with_headmasking(self): def test_load_with_mismatched_shapes(self): if not self.test_mismatched_shapes: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): @@ -1291,7 +1487,13 @@ def ids_tensor(shape, 
vocab_size, rng=None, name=None, dtype=None): def random_attention_mask(shape, rng=None, name=None, dtype=None): attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype) # make sure that at least one token is attended to for each batch - attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1) + attn_mask = tf.concat( + [ + tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), + attn_mask[:, 1:], + ], + axis=1, + ) return attn_mask @@ -1308,7 +1510,10 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): for _ in range(total_dims): values.append(rng.random() * scale) - return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape) + return tf.reshape( + tf.constant(values, dtype=dtype if dtype is not None else tf.float32), + shape=shape, + ) @require_tf @@ -1387,12 +1592,34 @@ def test_top_k_top_p_filtering(self): ) non_inf_expected_idx = tf.convert_to_tensor( - [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]], + [ + [0, 0], + [0, 9], + [0, 10], + [0, 25], + [0, 26], + [1, 13], + [1, 17], + [1, 18], + [1, 20], + [1, 27], + ], dtype=tf.int32, ) # expected non filtered idx as noted above non_inf_expected_output = tf.convert_to_tensor( - [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023], + [ + 8.222099, + 7.3534126, + 8.432078, + 7.4402075, + 9.38451, + 6.271159, + 8.827531, + 5.4402995, + 7.3857956, + 9.677023, + ], dtype=tf.float32, ) # expected non filtered values as noted above @@ -1423,19 +1650,31 @@ def tearDownClass(cls): pass try: - delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org") + delete_repo( + token=cls._token, + name="test-model-tf-org", + organization="valid_org", + ) except HTTPError: pass def test_push_to_hub(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, 
num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) model = TFBertModel(config) # Make sure model is properly initialized _ = model(model.dummy_inputs) with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(os.path.join(tmp_dir, "test-model-tf"), push_to_hub=True, use_auth_token=self._token) + model.save_pretrained( + os.path.join(tmp_dir, "test-model-tf"), + push_to_hub=True, + use_auth_token=self._token, + ) new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf") models_equal = True @@ -1446,7 +1685,11 @@ def test_push_to_hub(self): def test_push_to_hub_with_model_card(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: @@ -1455,7 +1698,11 @@ def test_push_to_hub_with_model_card(self): def test_push_to_hub_in_organization(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/tests/test_modeling_tf_convnext.py b/tests/test_modeling_tf_convnext.py index 233ec6662b820..6f8c142b654d8 100644 --- a/tests/test_modeling_tf_convnext.py +++ b/tests/test_modeling_tf_convnext.py @@ -16,6 +16,7 @@ import inspect import unittest +from typing import List, Tuple from transformers import ConvNextConfig from transformers.file_utils import cached_property, is_tf_available, is_vision_available @@ -222,6 +223,50 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) + # 
Since ConvNext does not have any attention we need to rewrite this test. + def test_model_outputs_equivalence(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + all(tf.equal(tuple_object, dict_object)), + msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}", + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + def test_for_image_classification(self): 
config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_classification(*config_and_inputs) From 7dcd98a346e91803660d1bb3ee0f4f6b8bb28bd2 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:18:41 +0530 Subject: [PATCH 45/65] rebasing --- playground.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 playground.py diff --git a/playground.py b/playground.py new file mode 100644 index 0000000000000..8a53d5babd2be --- /dev/null +++ b/playground.py @@ -0,0 +1,38 @@ +import tensorflow as tf +from transformers import AutoFeatureExtractor + +# import your TFConvNextForImageClassification class here, we will take care +# of adding the boilerplate to run `from transformers import +# TFConvNextForImageClassification` later +from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification +from transformers import ConvNextForImageClassification + +from PIL import Image + +# model = ConvNextForImageClassification.from_pretrained( +# "facebook/convnext-tiny-224", +# ) +# print(f"Model State Dict:\n") +# all_keys = list(model.state_dict().keys()) +# print([k for k in all_keys if "layer_scale" in k]) + +model = TFConvNextForImageClassification.from_pretrained( + "facebook/convnext-tiny-224", + from_pt=True, +) # notice the `from_pt` argument +print(model.summary(expand_nested=True)) + + +feature_extractor = AutoFeatureExtractor.from_pretrained( + "facebook/convnext-tiny-224" +) # don't know if this is supposed to work with TF as well, change this as needed + +image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path +inputs = feature_extractor(images=image, return_tensors="tf") + +# forward pass +outputs = model(**inputs) + +# verify the logits +assert outputs.logits.shape == [1, 1000] +tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) 
From 95fffedb65595409a2cc12e25bdb876985bc8452 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:19:23 +0530 Subject: [PATCH 46/65] rebasing and removing playground.py. --- playground.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 8a53d5babd2be..0000000000000 --- a/playground.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow as tf -from transformers import AutoFeatureExtractor - -# import your TFConvNextForImageClassification class here, we will take care -# of adding the boilerplate to run `from transformers import -# TFConvNextForImageClassification` later -from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification -from transformers import ConvNextForImageClassification - -from PIL import Image - -# model = ConvNextForImageClassification.from_pretrained( -# "facebook/convnext-tiny-224", -# ) -# print(f"Model State Dict:\n") -# all_keys = list(model.state_dict().keys()) -# print([k for k in all_keys if "layer_scale" in k]) - -model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, -) # notice the `from_pt` argument -print(model.summary(expand_nested=True)) - - -feature_extractor = AutoFeatureExtractor.from_pretrained( - "facebook/convnext-tiny-224" -) # don't know if this is supposed to work with TF as well, change this as needed - -image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path -inputs = feature_extractor(images=image, return_tensors="tf") - -# forward pass -outputs = model(**inputs) - -# verify the logits -assert outputs.logits.shape == [1, 1000] -tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From f8129a118ff2fe8a6e46f3cd3e56182ad718deb4 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 
17:18:41 +0530 Subject: [PATCH 47/65] rebasing --- playground.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 playground.py diff --git a/playground.py b/playground.py new file mode 100644 index 0000000000000..8a53d5babd2be --- /dev/null +++ b/playground.py @@ -0,0 +1,38 @@ +import tensorflow as tf +from transformers import AutoFeatureExtractor + +# import your TFConvNextForImageClassification class here, we will take care +# of adding the boilerplate to run `from transformers import +# TFConvNextForImageClassification` later +from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification +from transformers import ConvNextForImageClassification + +from PIL import Image + +# model = ConvNextForImageClassification.from_pretrained( +# "facebook/convnext-tiny-224", +# ) +# print(f"Model State Dict:\n") +# all_keys = list(model.state_dict().keys()) +# print([k for k in all_keys if "layer_scale" in k]) + +model = TFConvNextForImageClassification.from_pretrained( + "facebook/convnext-tiny-224", + from_pt=True, +) # notice the `from_pt` argument +print(model.summary(expand_nested=True)) + + +feature_extractor = AutoFeatureExtractor.from_pretrained( + "facebook/convnext-tiny-224" +) # don't know if this is supposed to work with TF as well, change this as needed + +image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path +inputs = feature_extractor(images=image, return_tensors="tf") + +# forward pass +outputs = model(**inputs) + +# verify the logits +assert outputs.logits.shape == [1, 1000] +tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From dab6866746a923e87fe3bf8d0b19b10ad28425ed Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 13 Feb 2022 17:19:23 +0530 Subject: [PATCH 48/65] rebasing and removing playground.py. 
--- playground.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 8a53d5babd2be..0000000000000 --- a/playground.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow as tf -from transformers import AutoFeatureExtractor - -# import your TFConvNextForImageClassification class here, we will take care -# of adding the boilerplate to run `from transformers import -# TFConvNextForImageClassification` later -from src.transformers.models.convnext.modeling_tf_convnext import TFConvNextForImageClassification -from transformers import ConvNextForImageClassification - -from PIL import Image - -# model = ConvNextForImageClassification.from_pretrained( -# "facebook/convnext-tiny-224", -# ) -# print(f"Model State Dict:\n") -# all_keys = list(model.state_dict().keys()) -# print([k for k in all_keys if "layer_scale" in k]) - -model = TFConvNextForImageClassification.from_pretrained( - "facebook/convnext-tiny-224", - from_pt=True, -) # notice the `from_pt` argument -print(model.summary(expand_nested=True)) - - -feature_extractor = AutoFeatureExtractor.from_pretrained( - "facebook/convnext-tiny-224" -) # don't know if this is supposed to work with TF as well, change this as needed - -image = Image.open("tests/fixtures/tests_samples/COCO/000000039769.png") # you might need to change the relative path -inputs = feature_extractor(images=image, return_tensors="tf") - -# forward pass -outputs = model(**inputs) - -# verify the logits -assert outputs.logits.shape == [1, 1000] -tf.debugging.assert_near(outputs.logits[0, :3], [-0.0260, -0.4739, 0.1911], atol=1e-4) From 69b541393442f188c6fc2f9026278e1a812b9ff2 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 24 Feb 2022 12:30:18 +0530 Subject: [PATCH 49/65] chore: moved convnext test to the correct location --- tests/{ => convnext}/test_modeling_tf_convnext.py | 0 1 file changed, 0 insertions(+), 0 
deletions(-) rename tests/{ => convnext}/test_modeling_tf_convnext.py (100%) diff --git a/tests/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py similarity index 100% rename from tests/test_modeling_tf_convnext.py rename to tests/convnext/test_modeling_tf_convnext.py From 15c6814e322420e7b36c35ca346f53f3c4e8b44e Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 24 Feb 2022 12:38:28 +0530 Subject: [PATCH 50/65] fix: locations for the test file of convnext. --- .../models/convnext/modeling_tf_convnext.py | 2205 ++++++++++++----- tests/convnext/test_modeling_tf_convnext.py | 45 + tests/test_modeling_tf_common.py | 419 +++- 3 files changed, 2031 insertions(+), 638 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 328194dddbc2c..2038f29e56cf8 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Meta Platforms Inc. and The HuggingFace Inc. team. All rights reserved. +# Copyright 2019 HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,609 +12,1710 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TF 2.0 ConvNext model.""" -from typing import Dict, Optional, Tuple, Union - -import numpy as np -import tensorflow as tf - -from ...activations_tf import get_tf_activation -from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings -from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput -from ...modeling_tf_utils import ( - TFModelInputType, - TFPreTrainedModel, - TFSequenceClassificationLoss, - get_initializer, - input_processing, - keras_serializable, +import copy +import inspect +import json +import os +import random +import tempfile +import unittest +from importlib import import_module +from typing import List, Tuple + +from huggingface_hub import delete_repo, login +from requests.exceptions import HTTPError +from transformers import is_tf_available +from transformers.models.auto import get_values +from transformers.testing_utils import tooslow # noqa: F401 +from transformers.testing_utils import ( + PASS, + USER, + CaptureLogger, + _tf_gpu_memory_limit, + is_pt_tf_cross_test, + is_staging_test, + require_tf, + require_tf2onnx, + slow, ) -from ...utils import logging -from .configuration_convnext import ConvNextConfig - +from transformers.utils import logging + + +if is_tf_available(): + import numpy as np + import tensorflow as tf + + from transformers import ( + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + TF_MODEL_FOR_PRETRAINING_MAPPING, + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + BertConfig, + TFAutoModel, + TFAutoModelForSequenceClassification, + TFBertModel, + TFSharedEmbeddings, + tf_top_k_top_p_filtering, + 
) + from transformers.generation_tf_utils import ( + TFBeamSampleDecoderOnlyOutput, + TFBeamSampleEncoderDecoderOutput, + TFBeamSearchDecoderOnlyOutput, + TFBeamSearchEncoderDecoderOutput, + TFGreedySearchDecoderOnlyOutput, + TFGreedySearchEncoderDecoderOutput, + TFSampleDecoderOnlyOutput, + TFSampleEncoderDecoderOutput, + ) -logger = logging.get_logger(__name__) + if _tf_gpu_memory_limit is not None: + gpus = tf.config.list_physical_devices("GPU") + for gpu in gpus: + # Restrict TensorFlow to only allocate x GB of memory on the GPUs + try: + tf.config.set_logical_device_configuration( + gpu, + [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)], + ) + logical_gpus = tf.config.list_logical_devices("GPU") + print("Logical GPUs", logical_gpus) + except RuntimeError as e: + # Virtual devices must be set before GPUs have been initialized + print(e) + + +def _config_zero_init(config): + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if "_range" in key or "_std" in key: + setattr(configs_no_init, key, 0.0) + return configs_no_init + + +@require_tf +class TFModelTesterMixin: + + model_tester = None + all_model_classes = () + all_generative_model_classes = () + test_mismatched_shapes = True + test_resize_embeddings = True + test_head_masking = True + is_encoder_decoder = False + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict: + inputs_dict = copy.deepcopy(inputs_dict) + + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict = { + k: tf.tile( + tf.expand_dims(v, 1), + (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1), + ) + if isinstance(v, tf.Tensor) and v.ndim > 0 + else v + for k, v in inputs_dict.items() + } + if return_labels: + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in 
get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING): + inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in [ + *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), + *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), + ]: + inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING): + inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in [ + *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING), + *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING), + *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING), + *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), + *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING), + ]: + inputs_dict["labels"] = tf.zeros( + ( + self.model_tester.batch_size, + self.model_tester.seq_length, + ), + dtype=tf.int32, + ) + return inputs_dict + + def test_initialization(self): + pass + + def test_save_load(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=False) + model = model_class.from_pretrained(tmpdirname) + after_outputs = model(self._prepare_for_class(inputs_dict, model_class)) + + self.assert_outputs_same(after_outputs, outputs) + + def test_save_load_config(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, 
model_class)) + model_config = model.get_config() + # make sure that returned config is jsonifiable, which is required by keras + json.dumps(model_config) + new_model = model_class.from_config(model.get_config()) + # make sure it also accepts a normal config + _ = model_class.from_config(model.config) + _ = new_model(self._prepare_for_class(inputs_dict, model_class)) # Build model + new_model.set_weights(model.get_weights()) + after_outputs = new_model(self._prepare_for_class(inputs_dict, model_class)) + + self.assert_outputs_same(after_outputs, outputs) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = [ + "input_ids", + "attention_mask", + "decoder_input_ids", + "decoder_attention_mask", + ] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask"] if "head_mask" and "decoder_head_mask" in arg_names else [] + ) + # Necessary to handle BART with newly added cross_attn_head_mask + expected_arg_names.extend( + ["cross_attn_head_mask", "encoder_outputs"] + if "cross_attn_head_mask" in arg_names + else ["encoder_outputs"] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + else: + expected_arg_names = ["input_ids"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_onnx_compliancy(self): + if not self.test_onnx: + return + + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + INTERNAL_OPS = [ + "Assert", + "AssignVariableOp", + "EmptyTensorList", + "ReadVariableOp", + "ResourceGather", + "TruncatedNormal", + "VarHandleOp", + "VarIsInitializedOp", + ] + onnx_ops = [] -_CONFIG_FOR_DOC = "ConvNextConfig" 
-_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224" + with open(os.path.join(".", "utils", "tf_ops", "onnx.json")) as f: + onnx_opsets = json.load(f)["opsets"] + for i in range(1, self.onnx_min_opset + 1): + onnx_ops.extend(onnx_opsets[str(i)]) -class TFConvNextDropPath(tf.keras.layers.Layer): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - References: - (1) github.com:rwightman/pytorch-image-models - """ + for model_class in self.all_model_classes: + model_op_names = set() - def __init__(self, drop_path, **kwargs): - super().__init__(**kwargs) - self.drop_path = drop_path + with tf.Graph().as_default() as g: + model = model_class(config) + model(model.dummy_inputs) - def call(self, x, training=None): - if training: - keep_prob = 1 - self.drop_path - shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) - random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) - random_tensor = tf.floor(random_tensor) - return (x / keep_prob) * random_tensor - return x + for op in g.get_operations(): + model_op_names.add(op.node_def.op) + model_op_names = sorted(model_op_names) + incompatible_ops = [] -class TFConvNextEmbeddings(tf.keras.layers.Layer): - """This class is comparable to (and inspired by) the SwinEmbeddings class - found in src/transformers/models/swin/modeling_swin.py. 
- """ + for op in model_op_names: + if op not in onnx_ops and op not in INTERNAL_OPS: + incompatible_ops.append(op) - def __init__(self, config, **kwargs): - super().__init__(**kwargs) - self.patch_embeddings = tf.keras.layers.Conv2D( - filters=config.hidden_sizes[0], - kernel_size=config.patch_size, - strides=config.patch_size, - name="patch_embeddings", - kernel_initializer=get_initializer(config.initializer_range), - bias_initializer="zeros", - ) - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") - - def call(self, pixel_values): - if isinstance(pixel_values, dict): - pixel_values = pixel_values["pixel_values"] - - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. - # So change the input format from `NCHW` to `NHWC`. - # shape = (batch_size, in_height, in_width, in_channels=num_channels) - pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) - - embeddings = self.patch_embeddings(pixel_values) - embeddings = self.layernorm(embeddings) - return embeddings - - -class TFConvNextLayer(tf.keras.layers.Layer): - """This corresponds to the `Block` class in the original implementation. - - There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C, - H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back - - The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to follow - NHWC ordering, we can just apply the operations straight-away without the permutation. - - Args: - config ([`ConvNextConfig`]): Model configuration class. - dim (`int`): Number of input channels. - drop_path (`float`): Stochastic depth rate. Default: 0.0. 
- """ - - def __init__(self, config, dim, drop_path=0.0, **kwargs): - super().__init__(**kwargs) - self.dim = dim - self.config = config - self.dwconv = tf.keras.layers.Conv2D( - filters=dim, - kernel_size=7, - padding="same", - groups=dim, - kernel_initializer=get_initializer(config.initializer_range), - bias_initializer="zeros", - name="dwconv", - ) # depthwise conv - self.layernorm = tf.keras.layers.LayerNormalization( - epsilon=1e-6, - name="layernorm", - ) - self.pwconv1 = tf.keras.layers.Dense( - units=4 * dim, - kernel_initializer=get_initializer(config.initializer_range), - bias_initializer="zeros", - name="pwconv1", - ) # pointwise/1x1 convs, implemented with linear layers - self.act = get_tf_activation(config.hidden_act) - self.pwconv2 = tf.keras.layers.Dense( - units=dim, - kernel_initializer=get_initializer(config.initializer_range), - bias_initializer="zeros", - name="pwconv2", - ) - # Using `layers.Activation` instead of `tf.identity` to better control `training` - # behaviour. 
- self.drop_path = ( - TFConvNextDropPath( - drop_path, - name="drop_path", - ) - if drop_path > 0.0 - else tf.keras.layers.Activation( - "linear", - name="drop_path", - ) - ) + self.assertEqual(len(incompatible_ops), 0, incompatible_ops) - def build(self, input_shape: tf.TensorShape): - # PT's `nn.Parameters` must be mapped to a TF layer weight to inherit the same name hierarchy (and vice-versa) - self.layer_scale_parameter = ( - self.add_weight( - shape=(self.dim,), - initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value), - trainable=True, - name="layer_scale_parameter", - ) - if self.config.layer_scale_init_value > 0 - else None - ) - super().build(input_shape) - - def call(self, hidden_states, training=False): - input = hidden_states - x = self.dwconv(hidden_states) - x = self.layernorm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - - if self.layer_scale_parameter is not None: - x = self.layer_scale_parameter * x - - x = input + self.drop_path(x, training=training) - return x - - -class TFConvNextStage(tf.keras.layers.Layer): - """ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks. - - Args: - config ([`ConvNextConfig`]): Model configuration class. - in_channels (`int`): Number of input channels. - out_channels (`int`): Number of output channels. - depth (`int`): Number of residual blocks. - drop_path_rates(`List[float]`): Stochastic depth rates for each layer. - """ - - def __init__( - self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs - ): - super().__init__(**kwargs) - if in_channels != out_channels or stride > 1: - self.downsampling_layer = [ - tf.keras.layers.LayerNormalization( - epsilon=1e-6, - name="downsampling_layer.0", - ), - # Inputs to this layer will follow NHWC format since we - # transposed the inputs from NCHW to NHWC in the `TFConvNextEmbeddings` - # layer. 
All the outputs throughout the model will be in NHWC - # from this point on until the output where we again change to - # NCHW. - tf.keras.layers.Conv2D( - filters=out_channels, - kernel_size=kernel_size, - strides=stride, - kernel_initializer=get_initializer(config.initializer_range), - bias_initializer="zeros", - name="downsampling_layer.1", - ), - ] - else: - self.downsampling_layer = [tf.identity] - - drop_path_rates = drop_path_rates or [0.0] * depth - self.layers = [ - TFConvNextLayer( - config, - dim=out_channels, - drop_path=drop_path_rates[j], - name=f"layers.{j}", - ) - for j in range(depth) - ] + @require_tf2onnx + @slow + def test_onnx_runtime_optimize(self): + if not self.test_onnx: + return - def call(self, hidden_states): - for layer in self.downsampling_layer: - hidden_states = layer(hidden_states) - for layer in self.layers: - hidden_states = layer(hidden_states) - return hidden_states - - -class TFConvNextEncoder(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): - super().__init__(**kwargs) - self.stages = [] - drop_path_rates = [x for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))] - cur = 0 - prev_chs = config.hidden_sizes[0] - for i in range(config.num_stages): - out_chs = config.hidden_sizes[i] - stage = TFConvNextStage( - config, - in_channels=prev_chs, - out_channels=out_chs, - stride=2 if i > 0 else 1, - depth=config.depths[i], - drop_path_rates=drop_path_rates[cur], - name=f"stages.{i}", - ) - self.stages.append(stage) - cur += config.depths[i] - prev_chs = out_chs + import onnxruntime + import tf2onnx - def call(self, hidden_states, output_hidden_states=False, return_dict=True): - all_hidden_states = () if output_hidden_states else None + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() - for i, layer_module in enumerate(self.stages): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) + for model_class in self.all_model_classes: + 
model = model_class(config) + model(model.dummy_inputs) - hidden_states = layer_module(hidden_states) + onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) + onnxruntime.InferenceSession(onnx_model_proto.SerializeToString()) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) + def test_keras_save_load(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() - return TFBaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, + tf_main_layer_classes = set( + module_member + for model_class in self.all_model_classes + for module in (import_module(model_class.__module__),) + for module_member_name in dir(module) + if module_member_name.endswith("MainLayer") + # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`. 
+ and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] + for module_member in (getattr(module, module_member_name),) + if isinstance(module_member, type) + and tf.keras.layers.Layer in module_member.__bases__ + and getattr(module_member, "_keras_serializable", False) ) + for main_layer_class in tf_main_layer_classes: + # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter + if "T5" in main_layer_class.__name__: + # Take the same values than in TFT5ModelTester for this shared layer + shared = TFSharedEmbeddings(99, 32, name="shared") + config.use_cache = inputs_dict.pop("use_cache", None) + main_layer = main_layer_class(config, embed_tokens=shared) + else: + main_layer = main_layer_class(config) + + symbolic_inputs = { + name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + } + model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + outputs = model(inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + filepath = os.path.join(tmpdirname, "keras_model.h5") + model.save(filepath) + if "T5" in main_layer_class.__name__: + model = tf.keras.models.load_model( + filepath, + custom_objects={ + main_layer_class.__name__: main_layer_class, + "TFSharedEmbeddings": TFSharedEmbeddings, + }, + ) + else: + model = tf.keras.models.load_model( + filepath, + custom_objects={main_layer_class.__name__: main_layer_class}, + ) + assert isinstance(model, tf.keras.Model) + after_outputs = model(inputs_dict) + self.assert_outputs_same(after_outputs, outputs) + + def assert_outputs_same(self, after_outputs, outputs): + # Make sure we don't have nans + if isinstance(after_outputs, tf.Tensor): + out_1 = after_outputs.numpy() + elif isinstance(after_outputs, dict): + out_1 = after_outputs[list(after_outputs.keys())[0]].numpy() + else: + out_1 = after_outputs[0].numpy() + out_2 = outputs[0].numpy() + self.assertEqual(out_1.shape, 
out_2.shape) + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + @is_pt_tf_cross_test + def test_pt_tf_model_equivalence(self): + import torch + + import transformers + + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning + pt_model_class = getattr(transformers, pt_model_class_name) + + config.output_hidden_states = True + + tf_model = model_class(config) + pt_model = pt_model_class(config) + + # Check we can load pt model in tf and vice-versa with model => model functions + tf_model = transformers.load_pytorch_model_in_tf2_model( + tf_model, + pt_model, + tf_inputs=self._prepare_for_class(inputs_dict, model_class), + ) + pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + pt_inputs_dict = {} + for name, key in self._prepare_for_class(inputs_dict, model_class).items(): + if type(key) == bool: + pt_inputs_dict[name] = key + elif name == "input_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + elif name == "pixel_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + elif name == "input_features": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + else: + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) + + with torch.no_grad(): + pto = pt_model(**pt_inputs_dict) + tfo = tf_model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + ) -@keras_serializable -class TFConvNextMainLayer(tf.keras.layers.Layer): - config_class = ConvNextConfig - - def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, 
**kwargs): - super().__init__(**kwargs) - - self.config = config - self.embeddings = TFConvNextEmbeddings(config, name="embeddings") - self.encoder = TFConvNextEncoder(config, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") - self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None - - def call( - self, - pixel_values: Optional[TFModelInputType] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - training: bool = False, - **kwargs, - ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + tf_hidden_states = tfo[0].numpy() + pt_hidden_states = pto[0].numpy() + + tf_nans = np.copy(np.isnan(tf_hidden_states)) + pt_nans = np.copy(np.isnan(pt_hidden_states)) + + pt_hidden_states[tf_nans] = 0 + tf_hidden_states[tf_nans] = 0 + pt_hidden_states[pt_nans] = 0 + tf_hidden_states[pt_nans] = 0 + + max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) + self.assertLessEqual(max_diff, 4e-2) + + # Check we can load pt model in tf and vice-versa with checkpoint => model functions + with tempfile.TemporaryDirectory() as tmpdirname: + pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") + torch.save(pt_model.state_dict(), pt_checkpoint_path) + tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) + + tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") + tf_model.save_weights(tf_checkpoint_path) + pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + pt_inputs_dict = {} + for name, key in self._prepare_for_class(inputs_dict, model_class).items(): + if type(key) == bool: + key = 
np.array(key, dtype=bool) + pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long) + elif name == "input_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + elif name == "pixel_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + elif name == "input_features": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + else: + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) + + with torch.no_grad(): + pto = pt_model(**pt_inputs_dict) + tfo = tf_model(self._prepare_for_class(inputs_dict, model_class)) + tfo = tfo[0].numpy() + pto = pto[0].numpy() + tf_nans = np.copy(np.isnan(tfo)) + pt_nans = np.copy(np.isnan(pto)) + + pto[tf_nans] = 0 + tfo[tf_nans] = 0 + pto[pt_nans] = 0 + tfo[pt_nans] = 0 + + max_diff = np.amax(np.abs(tfo - pto)) + self.assertLessEqual(max_diff, 4e-2) + + def test_compile_tf_model(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + max_input = getattr(self.model_tester, "max_position_embeddings", 512) + optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") + + for model_class in self.all_model_classes: + if model_class.__name__ in [ + "TFSpeech2TextModel", + "TFSpeech2TextForConditionalGeneration", + ]: + inputs = { + "decoder_input_ids": tf.keras.Input( + batch_shape=(2, max_input), + name="decoder_input_ids", + dtype="int32", + ), + "input_features": tf.keras.Input( + batch_shape=( + 2, + max_input, + self.model_tester.input_feat_per_channel * self.model_tester.input_channels, + ), + name="input_features", + dtype="float32", + ), + } + elif self.is_encoder_decoder: + inputs = { + "decoder_input_ids": tf.keras.Input( + batch_shape=(2, max_input), + name="decoder_input_ids", + dtype="int32", + ), + "input_ids": tf.keras.Input( + 
batch_shape=(2, max_input), + name="input_ids", + dtype="int32", + ), + } + # `pixel_values` implies that the input is an image + elif model_class.main_input_name == "pixel_values": + inputs = tf.keras.Input( + batch_shape=( + 3, + self.model_tester.num_channels, + self.model_tester.image_size, + self.model_tester.image_size, + ), + name="pixel_values", + dtype="float32", + ) + elif model_class.__name__ in ["TFCLIPModel"]: + inputs = { + "input_ids": tf.keras.Input( + batch_shape=(3, max_input), + name="input_ids", + dtype="int32", + ), + "pixel_values": tf.keras.Input( + batch_shape=( + 3, + self.model_tester.vision_model_tester.num_channels, + self.model_tester.vision_model_tester.image_size, + self.model_tester.vision_model_tester.image_size, + ), + name="pixel_values", + dtype="float32", + ), + } + elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs = tf.keras.Input( + batch_shape=(4, 2, max_input), + name="input_ids", + dtype="int32", + ) + else: + inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") + + # Prepare our model + model = model_class(config) + model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving. 
+ # Let's load it from the disk to be sure we can use pretrained weights + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=False) + model = model_class.from_pretrained(tmpdirname) + + outputs_dict = model(inputs) + hidden_states = outputs_dict[0] + + # Add a dense layer on top to test integration with other keras modules + outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) + + # Compile extended model + extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs]) + extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) + + def test_keyword_and_dict_args(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs_dict = model(inputs) + + inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + outputs_keywords = model(**inputs_keywords) + output_dict = outputs_dict[0].numpy() + output_keywords = outputs_keywords[0].numpy() + + self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) + + def test_attention_outputs(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + decoder_seq_length = getattr( + self.model_tester, + "decoder_seq_length", + self.model_tester.seq_length, ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - inputs = input_processing( - func=self.call, - config=self.config, - input_ids=pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - training=training, - kwargs_call=kwargs, + encoder_seq_length = getattr( + self.model_tester, + "encoder_seq_length", + self.model_tester.seq_length, ) + decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) 
+ encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + def check_decoder_attentions_output(outputs): + out_len = len(outputs) + self.assertEqual(min(out_len % 2, out_len % 5), 0) # differentiation due to newly added cross_attentions + decoder_attentions = outputs.decoder_attentions + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + decoder_key_length, + ], + ) - if "input_ids" in inputs: - inputs["pixel_values"] = inputs.pop("input_ids") + def check_encoder_attentions_output(outputs): + attentions = [ + t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions) + ] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + encoder_seq_length, + encoder_key_length, + ], + ) - if inputs["pixel_values"] is None: - raise ValueError("You have to specify pixel_values") + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["use_cache"] = False + config.output_hidden_states = False + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + out_len = len(outputs) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + if self.is_encoder_decoder: + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_decoder_attentions_output(outputs) + + # Check that output attentions can also be changed via the config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + 
self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) + self.assertEqual(model.config.output_hidden_states, True) + check_encoder_attentions_output(outputs) + + def test_headmasking(self): + if not self.test_head_masking: + return + + random.Random().seed(42) + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + random.Random().seed() + + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + + # Prepare head_mask + def prepare_layer_head_mask(i, attention_heads, num_hidden_layers): + if i == 0: + return tf.concat( + ( + tf.zeros(1, dtype=tf.float32), + tf.ones(attention_heads - 1, dtype=tf.float32), + ), + 0, + ) + elif i == num_hidden_layers - 1: + return tf.concat( + ( + tf.zeros(attention_heads - 1, dtype=tf.float32), + tf.ones(1, dtype=tf.float32), + ), + 0, + ) + else: + return tf.ones(attention_heads, dtype=tf.float32) + + head_mask = tf.stack( + [ + prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers) + for i in range(config.num_hidden_layers) + ], + 0, + ) - embedding_output = self.embeddings(inputs["pixel_values"], training=inputs["training"]) + inputs = self._prepare_for_class(inputs_dict, model_class).copy() + inputs["head_mask"] = head_mask + if model.config.is_encoder_decoder: + signature = inspect.signature(model.call) + arg_names = [*signature.parameters.keys()] + if "decoder_head_mask" in arg_names: # necessary diferentiation because of T5 
model + inputs["decoder_head_mask"] = head_mask + if "cross_attn_head_mask" in arg_names: + inputs["cross_attn_head_mask"] = head_mask + + outputs = model(**inputs, return_dict=True) + + def check_attentions_validity(attentions): + # Remove Nan + for t in attentions: + self.assertLess( + (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), + (tf.size(t) / 4).numpy(), + ) # Check we don't have more than 25% nans (arbitrary) + + attentions = [ + tf.where(tf.math.is_nan(t), 0.0, t) for t in attentions + ] # remove them (the test is less complete) + + self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0) + self.assertNotEqual( + tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), + 0.0, + ) + if len(attentions) > 2: # encoder-decodere models have only 2 layers in each modules + self.assertNotEqual( + tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), + 0.0, + ) + self.assertAlmostEqual( + tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), + 0.0, + ) + self.assertNotEqual( + tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), + 0.0, + ) + + if model.config.is_encoder_decoder: + check_attentions_validity(outputs.encoder_attentions) + check_attentions_validity(outputs.decoder_attentions) + if "cross_attn_head_mask" in arg_names: + check_attentions_validity(outputs.cross_attentions) + else: + check_attentions_validity(outputs.attentions) + + def test_hidden_states_output(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + def check_hidden_states_output(config, inputs_dict, model_class): + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + expected_num_layers = getattr( + self.model_tester, + "expected_num_hidden_layers", + self.model_tester.num_hidden_layers + 1, + ) - encoder_outputs = self.encoder( - embedding_output, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - 
training=inputs["training"], + if model.config.is_encoder_decoder: + encoder_hidden_states = outputs.encoder_hidden_states + decoder_hidden_states = outputs.decoder_hidden_states + + self.assertEqual(config.output_attentions, False) + self.assertEqual(len(encoder_hidden_states), expected_num_layers) + self.assertListEqual( + list(encoder_hidden_states[0].shape[-2:]), + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], + ) + self.assertEqual(len(decoder_hidden_states), expected_num_layers) + self.assertListEqual( + list(decoder_hidden_states[0].shape[-2:]), + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], + ) + else: + hidden_states = outputs.hidden_states + self.assertEqual(config.output_attentions, False) + self.assertEqual(len(hidden_states), expected_num_layers) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(config, inputs_dict, model_class) + + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + check_hidden_states_output(config, inputs_dict, model_class) + + def test_model_common_attributes(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + text_in_text_out_models = ( + get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) + + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) + + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING) ) + speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING) + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + if model_class in text_in_text_out_models: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for 
k, v in name.items(): + assert isinstance(v, tf.Variable) + elif model_class in speech_in_text_out_models: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert name is None + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_determinism(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + first, second = ( + model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + )[0], + model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + )[0], + ) + out_1 = first.numpy() + out_2 = second.numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def test_model_outputs_equivalence(self): + + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + all(tf.equal(tuple_object, dict_object)), + msg=f"Tuple and dict output are not equal. 
Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}", + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, + tuple_inputs, + dict_inputs, + {"output_hidden_states": True, "output_attentions": True}, + ) - last_hidden_state = encoder_outputs[0] - pooled_output = self.layernorm(self.pooler(last_hidden_state)) - - if not 
return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return TFBaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - ) + def test_inputs_embeds(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + + inputs = copy.deepcopy(inputs_dict) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids) + else: + inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids) + inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids) + + inputs = self._prepare_for_class(inputs, model_class) + + model(inputs) + + def test_numpy_arrays_inputs(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + def prepare_numpy_arrays(inputs_dict): + inputs_np_dict = {} + for k, v in inputs_dict.items(): + if tf.is_tensor(v): + inputs_np_dict[k] = v.numpy() + else: + inputs_np_dict[k] = np.array(k) + + return inputs_np_dict + + for model_class in self.all_model_classes: + model = model_class(config) + + inputs = self._prepare_for_class(inputs_dict, model_class) + inputs_np = prepare_numpy_arrays(inputs) + + output_for_dict_input = model(inputs_np) + output_for_kw_input = model(**inputs_np) + self.assert_outputs_same(output_for_dict_input, output_for_kw_input) + + def test_resize_token_embeddings(self): + if not self.test_resize_embeddings: + return + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + 
def _get_word_embedding_weight(model, embedding_layer): + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds + + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds + + model(model.dummy_inputs) + + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds + + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds + + return None + + for model_class in self.all_model_classes: + for size in [config.vocab_size - 10, config.vocab_size + 10, None]: + # build the embeddings + model = model_class(config=config) + old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + old_bias = model.get_bias() + old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + # reshape the embeddings + model.resize_token_embeddings(size) + new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + new_bias = model.get_bias() + new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + + # check that the resized embeddings size matches the desired size. 
+ assert_size = size if size is not None else config.vocab_size + self.assertEqual(new_input_embeddings.shape[0], assert_size) + + # check that weights remain the same after resizing + models_equal = True + for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_bias is not None and new_bias is not None: + for old_weight, new_weight in zip(old_bias.values(), new_bias.values()): + self.assertEqual(new_weight.shape[0], assert_size) + + models_equal = True + for p1, p2 in zip(old_weight.value(), new_weight.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_output_embeddings is not None and new_output_embeddings is not None: + self.assertEqual(new_output_embeddings.shape[0], assert_size) + self.assertEqual( + new_output_embeddings.shape[1], + old_output_embeddings.shape[1], + ) + + models_equal = True + for p1, p2 in zip( + old_output_embeddings.value(), + new_output_embeddings.value(), + ): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + def test_lm_head_model_random_no_beam_search_generate(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict.get("input_ids", None) + + # iterate over all generative models + for model_class in self.all_generative_model_classes: + model = model_class(config) + + if config.bos_token_id is None: + # if bos token id is not defined model needs input_ids + with self.assertRaises(AssertionError): + model.generate(do_sample=True, max_length=5) + # num_return_sequences = 1 + self._check_generated_ids(model.generate(input_ids, do_sample=True)) + elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]: + # Models with non-text inputs won't work here; num_return_sequences = 1 + 
self._check_generated_ids(model.generate(do_sample=True, max_length=5)) + + with self.assertRaises(ValueError): + # generating multiple sequences when no beam search generation + # is not allowed as it would always generate the same sequences + model.generate(input_ids, do_sample=False, num_return_sequences=2) + + # num_return_sequences > 1, sample + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2)) + + # check bad words tokens language generation + # create list of 1-seq bad token and list of 2-seq of bad tokens + bad_words_ids = [ + self._generate_random_bad_tokens(1, model), + self._generate_random_bad_tokens(2, model), + ] + output_tokens = model.generate( + input_ids, + do_sample=True, + bad_words_ids=bad_words_ids, + num_return_sequences=2, + ) + # only count generated tokens + generated_ids = output_tokens[:, input_ids.shape[-1] :] + self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) + + def test_lm_head_model_no_beam_search_generate_dict_outputs(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict.get("input_ids", None) + if input_ids is None: + input_ids = inputs_dict.get("input_features", None) + + # iterate over all generative models + for model_class in self.all_generative_model_classes: + model = model_class(config) + output_greedy = model.generate( + input_ids, + do_sample=False, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + output_sample = model.generate( + input_ids, + do_sample=True, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + if model.config.is_encoder_decoder: + self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput) + self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput) + else: + self.assertIsInstance(output_greedy, 
TFGreedySearchDecoderOnlyOutput) + self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput) + + def test_lm_head_model_random_beam_search_generate(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict.get("input_ids", None) + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + if config.bos_token_id is None: + # if bos token id is not defined model needs input_ids, num_return_sequences = 1 + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2)) + else: + # num_return_sequences = 1 + self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2)) + + with self.assertRaises(AssertionError): + # generating more sequences than having beams leads is not possible + model.generate( + input_ids, + do_sample=False, + num_return_sequences=3, + num_beams=2, + ) + + # num_return_sequences > 1, sample + self._check_generated_ids( + model.generate( + input_ids, + do_sample=True, + num_beams=2, + num_return_sequences=2, + ) + ) + # num_return_sequences > 1, greedy + self._check_generated_ids( + model.generate( + input_ids, + do_sample=False, + num_beams=2, + num_return_sequences=2, + ) + ) -class TFConvNextPreTrainedModel(TFPreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = ConvNextConfig - base_model_prefix = "convnext" - main_input_name = "pixel_values" - - @property - def dummy_inputs(self) -> Dict[str, tf.Tensor]: - """ - Dummy inputs to build the network. - - Returns: - `Dict[str, tf.Tensor]`: The dummy inputs. 
- """ - VISION_DUMMY_INPUTS = tf.random.uniform( - shape=( - 3, - self.config.num_channels, - self.config.image_size, - self.config.image_size, - ), - dtype=tf.float32, - ) - return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} + # check bad words tokens language generation + # create list of 1-seq bad token and list of 2-seq of bad tokens + bad_words_ids = [ + self._generate_random_bad_tokens(1, model), + self._generate_random_bad_tokens(2, model), + ] + output_tokens = model.generate( + input_ids, + do_sample=False, + bad_words_ids=bad_words_ids, + num_beams=2, + num_return_sequences=2, + ) + # only count generated tokens + generated_ids = output_tokens[:, input_ids.shape[-1] :] + self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) + + def test_lm_head_model_beam_search_generate_dict_outputs(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict.get("input_ids", None) + if input_ids is None: + input_ids = inputs_dict.get("input_features", None) + + # iterate over all generative models + for model_class in self.all_generative_model_classes: + model = model_class(config) + output_beam_search = model.generate( + input_ids, + num_beams=2, + do_sample=False, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + output_beam_sample = model.generate( + input_ids, + num_beams=2, + do_sample=True, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) - @tf.function( - input_signature=[ - { - "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), - } + if model.config.is_encoder_decoder: + self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput) + self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput) + else: + self.assertIsInstance(output_beam_search, 
TFBeamSearchDecoderOnlyOutput) + self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput) + + def test_loss_computation(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config) + if getattr(model, "hf_compute_loss", None): + # The number of elements in the loss should be the same as the number of elements in the label + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + added_label = prepared_for_class[ + sorted( + list(prepared_for_class.keys() - inputs_dict.keys()), + reverse=True, + )[0] + ] + loss_size = tf.size(added_label) + + if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING): + # if loss is causal lm loss, labels are shift, so that one label per batch + # is cut + loss_size = loss_size - self.model_tester.batch_size + + # Test that model correctly compute the loss with kwargs + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + possible_input_names = { + "input_ids", + "pixel_values", + "input_features", + } + input_name = possible_input_names.intersection(set(prepared_for_class)).pop() + model_input = prepared_for_class.pop(input_name) + + loss = model(model_input, **prepared_for_class)[0] + self.assertEqual(loss.shape, [loss_size]) + + # Test that model correctly compute the loss with a dict + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + loss = model(prepared_for_class)[0] + self.assertEqual(loss.shape, [loss_size]) + + # Test that model correctly compute the loss with a tuple + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + + # Get keys that were added with the _prepare_for_class function + label_keys = prepared_for_class.keys() - inputs_dict.keys() + signature = inspect.signature(model.call).parameters + 
signature_names = list(signature.keys()) + + # Create a dictionary holding the location of the tensors in the tuple + tuple_index_mapping = {0: input_name} + for label_key in label_keys: + label_key_index = signature_names.index(label_key) + tuple_index_mapping[label_key_index] = label_key + sorted_tuple_index_mapping = sorted(tuple_index_mapping.items()) + # Initialize a list with their default values, update the values and convert to a tuple + list_input = [] + + for name in signature_names: + if name != "kwargs": + list_input.append(signature[name].default) + + for index, value in sorted_tuple_index_mapping: + list_input[index] = prepared_for_class[value] + + tuple_input = tuple(list_input) + + # Send to model + loss = model(tuple_input[:-1])[0] + + self.assertEqual(loss.shape, [loss_size]) + + def test_generate_with_headmasking(self): + attention_names = [ + "encoder_attentions", + "decoder_attentions", + "cross_attentions", ] - ) - def serving(self, inputs): - """ - Method used for serving the model. - - Args: - inputs (`Dict[str, tf.Tensor]`): - The input of the saved model as a dictionary of tensors. - """ - return self.call(inputs) - - -CONVNEXT_START_DOCSTRING = r""" - This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it - as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and - behavior. 
- - - - TF 2.0 models accepts two formats as inputs: + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + # We want to test only encoder-decoder models + if not config.is_encoder_decoder: + continue + + head_masking = { + "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)), + "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), + "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), + } - - having all inputs as keyword arguments (like PyTorch models), or - - having all inputs as a list, tuple or dict in the first positional arguments. + signature = inspect.signature(model.call) + if set(head_masking.keys()) < set([*signature.parameters.keys()]): + continue + + for attn_name, (name, mask) in zip(attention_names, head_masking.items()): + out = model.generate( + inputs_dict["input_ids"], + num_beams=1, + max_length=inputs_dict["input_ids"] + 5, + output_attentions=True, + return_dict_in_generate=True, + **{name: mask}, + ) + # We check the state of decoder_attentions and cross_attentions just from the last step + attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] + self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0) + + def test_load_with_mismatched_shapes(self): + if not self.test_mismatched_shapes: + return + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): + continue + + with self.subTest(msg=f"Testing {model_class}"): + with tempfile.TemporaryDirectory() as tmp_dir: + model = model_class(config) + inputs = self._prepare_for_class(inputs_dict, model_class) + _ = model(**inputs) + model.save_pretrained(tmp_dir) 
+ + # Fails when we don't set ignore_mismatched_sizes=True + with self.assertRaises(ValueError): + new_model = TFAutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42) + with self.assertRaises(ValueError): + new_model_without_prefix = TFAutoModel.from_pretrained(tmp_dir, vocab_size=10) + + logger = logging.get_logger("transformers.modeling_tf_utils") + with CaptureLogger(logger) as cl: + new_model = TFAutoModelForSequenceClassification.from_pretrained( + tmp_dir, num_labels=42, ignore_mismatched_sizes=True + ) + self.assertIn("the shapes did not match", cl.out) + + logits = new_model(**inputs).logits + self.assertEqual(logits.shape[1], 42) + + with CaptureLogger(logger) as cl: + new_model_without_prefix = TFAutoModel.from_pretrained( + tmp_dir, vocab_size=10, ignore_mismatched_sizes=True + ) + self.assertIn("the shapes did not match", cl.out) + + # Although Tf models always have a prefix pointing to `MainLayer`, + # we still add this "without prefix" test to keep a consistency between tf and pt tests. 
+ input_ids = ids_tensor((2, 8), 10) + if self.is_encoder_decoder: + new_model_without_prefix(input_ids, decoder_input_ids=input_ids) + else: + new_model_without_prefix(input_ids) + + def test_model_main_input_name(self): + for model_class in self.all_model_classes: + model_signature = inspect.signature(getattr(model_class, "call")) + # The main input is the name of the argument after `self` + observed_main_input_name = list(model_signature.parameters.keys())[1] + self.assertEqual(model_class.main_input_name, observed_main_input_name) + + def _generate_random_bad_tokens(self, num_bad_tokens, model): + # special tokens cannot be bad tokens + special_tokens = [] + if model.config.bos_token_id is not None: + special_tokens.append(model.config.bos_token_id) + if model.config.pad_token_id is not None: + special_tokens.append(model.config.pad_token_id) + if model.config.eos_token_id is not None: + special_tokens.append(model.config.eos_token_id) + + # create random bad tokens that are not special tokens + bad_tokens = [] + while len(bad_tokens) < num_bad_tokens: + token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0] + if token not in special_tokens: + bad_tokens.append(token) + return bad_tokens + + def _check_generated_ids(self, output_ids): + for token_id in output_ids[0].numpy().tolist(): + self.assertGreaterEqual(token_id, 0) + self.assertLess(token_id, self.model_tester.vocab_size) + + def _check_match_tokens(self, generated_ids, bad_words_ids): + # for all bad word tokens + for bad_word_ids in bad_words_ids: + # for all slices in batch + for generated_ids_slice in generated_ids: + # for all word idx + for i in range(len(bad_word_ids), len(generated_ids_slice)): + # if tokens match + if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids: + return True + return False + + +def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is 
None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32) + + return output + + +def random_attention_mask(shape, rng=None, name=None, dtype=None): + attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype) + # make sure that at least one token is attended to for each batch + attn_mask = tf.concat( + [ + tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), + attn_mask[:, 1:], + ], + axis=1, + ) + return attn_mask - This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the - tensors in the first argument of the model call function: `model(inputs)`. - +def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): + """Creates a random float32 tensor""" + if rng is None: + rng = random.Random() - Parameters: - config ([`ConvNextConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. -""" + total_dims = 1 + for dim in shape: + total_dims *= dim -CONVNEXT_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`ConvNextFeatureExtractor`]. See - [`ConvNextFeatureExtractor.__call__`] for details. + values = [] + for _ in range(total_dims): + values.append(rng.random() * scale) - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. 
This argument can be used only in eager mode, in graph mode the value in the config will be - used instead. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used - in eager mode, in graph mode the value will always be set to True. -""" + return tf.reshape( + tf.constant(values, dtype=dtype if dtype is not None else tf.float32), + shape=shape, + ) -@add_start_docstrings( - "The bare ConvNext model outputting raw features without any specific head on top.", - CONVNEXT_START_DOCSTRING, -) -class TFConvNextModel(TFConvNextPreTrainedModel): - def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs): - super().__init__(config, *inputs, **kwargs) - self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext") - - @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) - def call( - self, - pixel_values: Optional[TFModelInputType] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - training: bool = False, - **kwargs, - ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - r""" - Returns: - - Examples: - - ```python - >>> from transformers import ConvNextFeatureExtractor, TFConvNextModel - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") - >>> model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224") - - >>> inputs = feature_extractor(images=image, return_tensors="tf") - >>> outputs = model(**inputs) - >>> last_hidden_states = outputs.last_hidden_state - ```""" - output_hidden_states = ( - output_hidden_states if output_hidden_states is not 
None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - inputs = input_processing( - func=self.call, - config=self.config, - input_ids=pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - training=training, - kwargs_call=kwargs, +@require_tf +class UtilsFunctionsTest(unittest.TestCase): + + # tests whether the top_k_top_p_filtering function behaves as expected + def test_top_k_top_p_filtering(self): + logits = tf.convert_to_tensor( + [ + [ + 8.2220991, # 3rd highest value; idx. 0 + -0.5620044, + 5.23229752, + 4.0386393, + -6.8798378, + -0.54785802, + -3.2012153, + 2.92777176, + 1.88171953, + 7.35341276, # 5th highest value; idx. 9 + 8.43207833, # 2nd highest value; idx. 10 + -9.85711836, + -5.96209236, + -1.13039161, + -7.1115294, + -0.8369633, + -5.3186408, + 7.06427407, + 0.81369344, + -0.82023817, + -5.9179796, + 0.58813443, + -6.99778438, + 4.71551189, + -0.18771637, + 7.44020759, # 4th highest value; idx. 25 + 9.38450987, # 1st highest value; idx. 26 + 2.12662941, + -9.32562038, + 2.35652522, + ], # cummulative prob of 5 highest values <= 0.6 + [ + 0.58425518, + 4.53139238, + -5.57510464, + -6.28030699, + -7.19529503, + -4.02122551, + 1.39337037, + -6.06707057, + 1.59480517, + -9.643119, + 0.03907799, + 0.67231762, + -8.88206726, + 6.27115922, # 4th highest value; idx. 13 + 2.28520723, + 4.82767506, + 4.30421368, + 8.8275313, # 2nd highest value; idx. 17 + 5.44029958, # 5th highest value; idx. 18 + -4.4735794, + 7.38579536, # 3rd highest value; idx. 20 + -2.91051663, + 2.61946077, + -2.5674762, + -9.48959302, + -4.02922645, + -1.35416918, + 9.67702323, # 1st highest value; idx. 
27 + -5.89478553, + 1.85370467, + ], # cummulative prob of 5 highest values <= 0.6 + ], + dtype=tf.float32, ) - if "input_ids" in inputs: - inputs["pixel_values"] = inputs.pop("input_ids") - - if inputs["pixel_values"] is None: - raise ValueError("You have to specify pixel_values") - - outputs = self.convnext( - pixel_values=inputs["pixel_values"], - output_hidden_states=output_hidden_states, - return_dict=return_dict, - training=inputs["training"], - ) + non_inf_expected_idx = tf.convert_to_tensor( + [ + [0, 0], + [0, 9], + [0, 10], + [0, 25], + [0, 26], + [1, 13], + [1, 17], + [1, 18], + [1, 20], + [1, 27], + ], + dtype=tf.int32, + ) # expected non filtered idx as noted above + + non_inf_expected_output = tf.convert_to_tensor( + [ + 8.222099, + 7.3534126, + 8.432078, + 7.4402075, + 9.38451, + 6.271159, + 8.827531, + 5.4402995, + 7.3857956, + 9.677023, + ], + dtype=tf.float32, + ) # expected non filtered values as noted above - # converts back NHWC -> NCHW, to match PT's output - if not return_dict: - return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:] + output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4) - return TFBaseModelOutputWithPooling( - last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)), - pooler_output=outputs.pooler_output, - hidden_states=outputs.hidden_states, + non_inf_output = output[output != -float("inf")] + non_inf_idx = tf.cast( + tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))), + dtype=tf.int32, ) - -@add_start_docstrings( - """ - ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for - ImageNet. 
- """, - CONVNEXT_START_DOCSTRING, -) -class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss): - def __init__(self, config: ConvNextConfig, *inputs, **kwargs): - super().__init__(config, *inputs, **kwargs) - - self.num_labels = config.num_labels - self.convnext = TFConvNextMainLayer(config, name="convnext") - - # Classifier head - self.classifier = tf.keras.layers.Dense( - units=config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - bias_initializer="zeros", - name="classifier", + tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12) + tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx) + + +@require_tf +@is_staging_test +class TFModelPushToHubTester(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._token = login(username=USER, password=PASS) + + @classmethod + def tearDownClass(cls): + try: + delete_repo(token=cls._token, name="test-model-tf") + except HTTPError: + pass + + try: + delete_repo( + token=cls._token, + name="test-model-tf-org", + organization="valid_org", + ) + except HTTPError: + pass + + def test_push_to_hub(self): + config = BertConfig( + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) + model = TFBertModel(config) + # Make sure model is properly initialized + _ = model(model.dummy_inputs) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained( + os.path.join(tmp_dir, "test-model-tf"), + push_to_hub=True, + use_auth_token=self._token, + ) - @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) - def call( - self, - pixel_values: Optional[TFModelInputType] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - labels: Optional[Union[np.ndarray, tf.Tensor]] = None, - training: Optional[bool] = False, - 
**kwargs, - ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: - r""" - labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): - Labels for computing the image classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - - Returns: - - Examples: - - ```python - >>> from transformers import ConvNextFeatureExtractor, TFConvNextForImageClassification - >>> import tensorflow as tf - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") - >>> model = TFViTForImageClassification.from_pretrained("facebook/convnext-tiny-224") - - >>> inputs = feature_extractor(images=image, return_tensors="tf") - >>> outputs = model(**inputs) - >>> logits = outputs.logits - >>> # model predicts one of the 1000 ImageNet classes - >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0] - >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)]) - ```""" - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf") + models_equal = True + for p1, p2 in zip(model.weights, new_model.weights): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + def test_push_to_hub_with_model_card(self): + config = BertConfig( + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - inputs = input_processing( - func=self.call, - 
config=self.config, - input_ids=pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - labels=labels, - training=training, - kwargs_call=kwargs, + model = TFBertModel(config) + with tempfile.TemporaryDirectory() as tmp_dir: + model.push_to_hub(os.path.join(tmp_dir, "test-model-tf")) + self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "test-model-card-tf", "README.md"))) + + def test_push_to_hub_in_organization(self): + config = BertConfig( + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) + model = TFBertModel(config) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained( + os.path.join(tmp_dir, "test-model-tf-org"), + push_to_hub=True, + use_auth_token=self._token, + organization="valid_org", + ) - if "input_ids" in inputs: - inputs["pixel_values"] = inputs.pop("input_ids") - - if inputs["pixel_values"] is None: - raise ValueError("You have to specify pixel_values") - - outputs = self.convnext( - inputs["pixel_values"], - output_hidden_states=output_hidden_states, - return_dict=return_dict, - training=inputs["training"], - ) - - pooled_output = outputs.pooler_output if return_dict else outputs[1] - - logits = self.classifier(pooled_output) - loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits) - - if not inputs["return_dict"]: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return TFSequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - ) + new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org") + models_equal = True + for p1, p2 in zip(model.weights, new_model.weights): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) diff --git a/tests/convnext/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py index 
233ec6662b820..6f8c142b654d8 100644 --- a/tests/convnext/test_modeling_tf_convnext.py +++ b/tests/convnext/test_modeling_tf_convnext.py @@ -16,6 +16,7 @@ import inspect import unittest +from typing import List, Tuple from transformers import ConvNextConfig from transformers.file_utils import cached_property, is_tf_available, is_vision_available @@ -222,6 +223,50 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) + # Since ConvNext does not have any attention we need to rewrite this test. + def test_model_outputs_equivalence(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + all(tf.equal(tuple_object, dict_object)), + msg=f"Tuple and dict output are not equal. 
Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}", + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_classification(*config_and_inputs) diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index e072b4febd90b..2038f29e56cf8 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -83,7 +83,8 @@ # Restrict TensorFlow to only allocate x GB of memory on the GPUs try: tf.config.set_logical_device_configuration( - gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)] + gpu, + [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)], ) logical_gpus = tf.config.list_logical_devices("GPU") print("Logical GPUs", logical_gpus) @@ -116,7 +117,10 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d if model_class in 
get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict = { - k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1)) + k: tf.tile( + tf.expand_dims(v, 1), + (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1), + ) if isinstance(v, tf.Tensor) and v.ndim > 0 else v for k, v in inputs_dict.items() @@ -144,7 +148,11 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING), ]: inputs_dict["labels"] = tf.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32 + ( + self.model_tester.batch_size, + self.model_tester.seq_length, + ), + dtype=tf.int32, ) return inputs_dict @@ -152,7 +160,10 @@ def test_initialization(self): pass def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -166,7 +177,10 @@ def test_save_load(self): self.assert_outputs_same(after_outputs, outputs) def test_save_load_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -218,7 +232,10 @@ def test_onnx_compliancy(self): if not self.test_onnx: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() INTERNAL_OPS = [ "Assert", "AssignVariableOp", @@ -265,7 +282,10 @@ def test_onnx_runtime_optimize(self): import onnxruntime import tf2onnx - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for 
model_class in self.all_model_classes: model = model_class(config) @@ -276,7 +296,10 @@ def test_onnx_runtime_optimize(self): onnxruntime.InferenceSession(onnx_model_proto.SerializeToString()) def test_keras_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() tf_main_layer_classes = set( module_member @@ -321,7 +344,8 @@ def test_keras_save_load(self): ) else: model = tf.keras.models.load_model( - filepath, custom_objects={main_layer_class.__name__: main_layer_class} + filepath, + custom_objects={main_layer_class.__name__: main_layer_class}, ) assert isinstance(model, tf.keras.Model) after_outputs = model(inputs_dict) @@ -348,7 +372,10 @@ def test_pt_tf_model_equivalence(self): import transformers - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning @@ -361,7 +388,9 @@ def test_pt_tf_model_equivalence(self): # Check we can load pt model in tf and vice-versa with model => model functions tf_model = transformers.load_pytorch_model_in_tf2_model( - tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class) + tf_model, + pt_model, + tf_inputs=self._prepare_for_class(inputs_dict, model_class), ) pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) @@ -382,7 +411,10 @@ def test_pt_tf_model_equivalence(self): with torch.no_grad(): pto = pt_model(**pt_inputs_dict) - tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False) + tfo = tf_model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + ) tf_hidden_states = tfo[0].numpy() pt_hidden_states = pto[0].numpy() @@ -441,14 +473,20 @@ def 
test_pt_tf_model_equivalence(self): self.assertLessEqual(max_diff, 4e-2) def test_compile_tf_model(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() max_input = getattr(self.model_tester, "max_position_embeddings", 512) optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") for model_class in self.all_model_classes: - if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]: + if model_class.__name__ in [ + "TFSpeech2TextModel", + "TFSpeech2TextForConditionalGeneration", + ]: inputs = { "decoder_input_ids": tf.keras.Input( batch_shape=(2, max_input), @@ -472,7 +510,11 @@ def test_compile_tf_model(self): name="decoder_input_ids", dtype="int32", ), - "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"), + "input_ids": tf.keras.Input( + batch_shape=(2, max_input), + name="input_ids", + dtype="int32", + ), } # `pixel_values` implies that the input is an image elif model_class.main_input_name == "pixel_values": @@ -488,7 +530,11 @@ def test_compile_tf_model(self): ) elif model_class.__name__ in ["TFCLIPModel"]: inputs = { - "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"), + "input_ids": tf.keras.Input( + batch_shape=(3, max_input), + name="input_ids", + dtype="int32", + ), "pixel_values": tf.keras.Input( batch_shape=( 3, @@ -501,7 +547,11 @@ def test_compile_tf_model(self): ), } elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32") + inputs = tf.keras.Input( + batch_shape=(4, 2, max_input), + name="input_ids", + dtype="int32", + ) else: inputs = 
tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") @@ -524,7 +574,10 @@ def test_compile_tf_model(self): extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) def test_keyword_and_dict_args(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -540,10 +593,21 @@ def test_keyword_and_dict_args(self): self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + decoder_seq_length = getattr( + self.model_tester, + "decoder_seq_length", + self.model_tester.seq_length, + ) + encoder_seq_length = getattr( + self.model_tester, + "encoder_seq_length", + self.model_tester.seq_length, + ) decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) @@ -554,7 +618,11 @@ def check_decoder_attentions_output(outputs): self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + decoder_key_length, + ], ) def check_encoder_attentions_output(outputs): @@ -564,7 +632,11 @@ def check_encoder_attentions_output(outputs): self.assertEqual(len(attentions), 
self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + [ + self.model_tester.num_attention_heads, + encoder_seq_length, + encoder_key_length, + ], ) for model_class in self.all_model_classes: @@ -606,7 +678,10 @@ def test_headmasking(self): return random.Random().seed(42) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() random.Random().seed() inputs_dict["output_attentions"] = True @@ -619,11 +694,19 @@ def test_headmasking(self): def prepare_layer_head_mask(i, attention_heads, num_hidden_layers): if i == 0: return tf.concat( - (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0 + ( + tf.zeros(1, dtype=tf.float32), + tf.ones(attention_heads - 1, dtype=tf.float32), + ), + 0, ) elif i == num_hidden_layers - 1: return tf.concat( - (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0 + ( + tf.zeros(attention_heads - 1, dtype=tf.float32), + tf.ones(1, dtype=tf.float32), + ), + 0, ) else: return tf.ones(attention_heads, dtype=tf.float32) @@ -652,7 +735,8 @@ def check_attentions_validity(attentions): # Remove Nan for t in attentions: self.assertLess( - (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy() + (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), + (tf.size(t) / 4).numpy(), ) # Check we don't have more than 25% nans (arbitrary) attentions = [ @@ -660,11 +744,23 @@ def check_attentions_validity(attentions): ] # remove them (the test is less complete) self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0) - self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0) + self.assertNotEqual( + tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), + 
0.0, + ) if len(attentions) > 2: # encoder-decodere models have only 2 layers in each modules - self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0) - self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0) - self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0) + self.assertNotEqual( + tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), + 0.0, + ) + self.assertAlmostEqual( + tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), + 0.0, + ) + self.assertNotEqual( + tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), + 0.0, + ) if model.config.is_encoder_decoder: check_attentions_validity(outputs.encoder_attentions) @@ -675,13 +771,18 @@ def check_attentions_validity(attentions): check_attentions_validity(outputs.attentions) def test_hidden_states_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() def check_hidden_states_output(config, inputs_dict, model_class): model = model_class(config) outputs = model(self._prepare_for_class(inputs_dict, model_class)) expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + self.model_tester, + "expected_num_hidden_layers", + self.model_tester.num_hidden_layers + 1, ) if model.config.is_encoder_decoder: @@ -692,12 +793,18 @@ def check_hidden_states_output(config, inputs_dict, model_class): self.assertEqual(len(encoder_hidden_states), expected_num_layers) self.assertListEqual( list(encoder_hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size], + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], ) self.assertEqual(len(decoder_hidden_states), expected_num_layers) self.assertListEqual( list(decoder_hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, 
self.model_tester.hidden_size], + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], ) else: hidden_states = outputs.hidden_states @@ -705,7 +812,10 @@ def check_hidden_states_output(config, inputs_dict, model_class): self.assertEqual(len(hidden_states), expected_num_layers) self.assertListEqual( list(hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size], + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], ) for model_class in self.all_model_classes: @@ -717,7 +827,10 @@ def check_hidden_states_output(config, inputs_dict, model_class): check_hidden_states_output(config, inputs_dict, model_class) def test_model_common_attributes(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() text_in_text_out_models = ( get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) @@ -747,13 +860,22 @@ def test_model_common_attributes(self): assert name is None def test_determinism(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) first, second = ( - model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], - model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], + model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + )[0], + model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + )[0], ) out_1 = first.numpy() out_2 = second.numpy() @@ -764,7 +886,10 @@ def test_determinism(self): def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = 
self.model_tester.prepare_config_and_inputs_for_common() def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) @@ -799,30 +924,32 @@ def recursive_check(tuple_object, dict_object): dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - # Pure conv models (such as ConvNeXt) don't have `output_attentions`. - if config.output_attentions: - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - if config.output_attentions: - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - if config.output_attentions: - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence( - model, 
tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} - ) + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, + tuple_inputs, + dict_inputs, + {"output_hidden_states": True, "output_attentions": True}, + ) def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -849,7 +976,10 @@ def test_inputs_embeds(self): model(inputs) def test_numpy_arrays_inputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() def prepare_numpy_arrays(inputs_dict): inputs_np_dict = {} @@ -874,7 +1004,10 @@ def prepare_numpy_arrays(inputs_dict): def test_resize_token_embeddings(self): if not self.test_resize_embeddings: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() def _get_word_embedding_weight(model, embedding_layer): embeds = getattr(embedding_layer, "weight", None) @@ -933,16 +1066,25 @@ def _get_word_embedding_weight(model, embedding_layer): if old_output_embeddings is not None and new_output_embeddings is not None: self.assertEqual(new_output_embeddings.shape[0], assert_size) - self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1]) + self.assertEqual( + new_output_embeddings.shape[1], + old_output_embeddings.shape[1], + ) models_equal = True - for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): + for p1, p2 in zip( + old_output_embeddings.value(), + new_output_embeddings.value(), 
+ ): if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: models_equal = False self.assertTrue(models_equal) def test_lm_head_model_random_no_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) # iterate over all generative models @@ -969,16 +1111,25 @@ def test_lm_head_model_random_no_beam_search_generate(self): # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + bad_words_ids = [ + self._generate_random_bad_tokens(1, model), + self._generate_random_bad_tokens(2, model), + ] output_tokens = model.generate( - input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 + input_ids, + do_sample=True, + bad_words_ids=bad_words_ids, + num_return_sequences=2, ) # only count generated tokens generated_ids = output_tokens[:, input_ids.shape[-1] :] self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) def test_lm_head_model_no_beam_search_generate_dict_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) if input_ids is None: input_ids = inputs_dict.get("input_features", None) @@ -1011,7 +1162,10 @@ def test_lm_head_model_no_beam_search_generate_dict_outputs(self): self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput) def test_lm_head_model_random_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = 
inputs_dict.get("input_ids", None) for model_class in self.all_generative_model_classes: @@ -1026,7 +1180,12 @@ def test_lm_head_model_random_beam_search_generate(self): with self.assertRaises(AssertionError): # generating more sequences than having beams leads is not possible - model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2) + model.generate( + input_ids, + do_sample=False, + num_return_sequences=3, + num_beams=2, + ) # num_return_sequences > 1, sample self._check_generated_ids( @@ -1038,20 +1197,37 @@ def test_lm_head_model_random_beam_search_generate(self): ) ) # num_return_sequences > 1, greedy - self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2)) + self._check_generated_ids( + model.generate( + input_ids, + do_sample=False, + num_beams=2, + num_return_sequences=2, + ) + ) # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + bad_words_ids = [ + self._generate_random_bad_tokens(1, model), + self._generate_random_bad_tokens(2, model), + ] output_tokens = model.generate( - input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 + input_ids, + do_sample=False, + bad_words_ids=bad_words_ids, + num_beams=2, + num_return_sequences=2, ) # only count generated tokens generated_ids = output_tokens[:, input_ids.shape[-1] :] self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) def test_lm_head_model_beam_search_generate_dict_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) if input_ids is None: input_ids = inputs_dict.get("input_features", None) @@ -1086,14 +1262,20 @@ 
def test_lm_head_model_beam_search_generate_dict_outputs(self): self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput) def test_loss_computation(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) if getattr(model, "hf_compute_loss", None): # The number of elements in the loss should be the same as the number of elements in the label prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) added_label = prepared_for_class[ - sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] + sorted( + list(prepared_for_class.keys() - inputs_dict.keys()), + reverse=True, + )[0] ] loss_size = tf.size(added_label) @@ -1104,7 +1286,11 @@ def test_loss_computation(self): # Test that model correctly compute the loss with kwargs prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - possible_input_names = {"input_ids", "pixel_values", "input_features"} + possible_input_names = { + "input_ids", + "pixel_values", + "input_features", + } input_name = possible_input_names.intersection(set(prepared_for_class)).pop() model_input = prepared_for_class.pop(input_name) @@ -1148,8 +1334,15 @@ def test_loss_computation(self): self.assertEqual(loss.shape, [loss_size]) def test_generate_with_headmasking(self): - attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + attention_names = [ + "encoder_attentions", + "decoder_attentions", + "cross_attentions", + ] + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_generative_model_classes: model = model_class(config) @@ -1184,7 +1377,10 @@ def 
test_generate_with_headmasking(self): def test_load_with_mismatched_shapes(self): if not self.test_mismatched_shapes: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): @@ -1291,7 +1487,13 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): def random_attention_mask(shape, rng=None, name=None, dtype=None): attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype) # make sure that at least one token is attended to for each batch - attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1) + attn_mask = tf.concat( + [ + tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), + attn_mask[:, 1:], + ], + axis=1, + ) return attn_mask @@ -1308,7 +1510,10 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): for _ in range(total_dims): values.append(rng.random() * scale) - return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape) + return tf.reshape( + tf.constant(values, dtype=dtype if dtype is not None else tf.float32), + shape=shape, + ) @require_tf @@ -1387,12 +1592,34 @@ def test_top_k_top_p_filtering(self): ) non_inf_expected_idx = tf.convert_to_tensor( - [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]], + [ + [0, 0], + [0, 9], + [0, 10], + [0, 25], + [0, 26], + [1, 13], + [1, 17], + [1, 18], + [1, 20], + [1, 27], + ], dtype=tf.int32, ) # expected non filtered idx as noted above non_inf_expected_output = tf.convert_to_tensor( - [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023], + [ + 8.222099, + 7.3534126, + 8.432078, + 7.4402075, + 9.38451, + 6.271159, + 8.827531, + 5.4402995, + 
7.3857956, + 9.677023, + ], dtype=tf.float32, ) # expected non filtered values as noted above @@ -1423,19 +1650,31 @@ def tearDownClass(cls): pass try: - delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org") + delete_repo( + token=cls._token, + name="test-model-tf-org", + organization="valid_org", + ) except HTTPError: pass def test_push_to_hub(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) model = TFBertModel(config) # Make sure model is properly initialized _ = model(model.dummy_inputs) with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(os.path.join(tmp_dir, "test-model-tf"), push_to_hub=True, use_auth_token=self._token) + model.save_pretrained( + os.path.join(tmp_dir, "test-model-tf"), + push_to_hub=True, + use_auth_token=self._token, + ) new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf") models_equal = True @@ -1446,7 +1685,11 @@ def test_push_to_hub(self): def test_push_to_hub_with_model_card(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: @@ -1455,7 +1698,11 @@ def test_push_to_hub_with_model_card(self): def test_push_to_hub_in_organization(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: From 98111f8500b16b258191775e493cb6dd8ce5e37f Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 24 Feb 2022 
13:34:05 +0530 Subject: [PATCH 51/65] fix: convnext tests. --- .../models/convnext/modeling_tf_convnext.py | 2205 +++++------------ tests/convnext/test_modeling_tf_convnext.py | 4 +- 2 files changed, 554 insertions(+), 1655 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 2038f29e56cf8..328194dddbc2c 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2019 HuggingFace Inc. +# Copyright 2022 Meta Platforms Inc. and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,1710 +12,609 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+""" TF 2.0 ConvNext model.""" -import copy -import inspect -import json -import os -import random -import tempfile -import unittest -from importlib import import_module -from typing import List, Tuple - -from huggingface_hub import delete_repo, login -from requests.exceptions import HTTPError -from transformers import is_tf_available -from transformers.models.auto import get_values -from transformers.testing_utils import tooslow # noqa: F401 -from transformers.testing_utils import ( - PASS, - USER, - CaptureLogger, - _tf_gpu_memory_limit, - is_pt_tf_cross_test, - is_staging_test, - require_tf, - require_tf2onnx, - slow, +from typing import Dict, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + input_processing, + keras_serializable, ) -from transformers.utils import logging - - -if is_tf_available(): - import numpy as np - import tensorflow as tf - - from transformers import ( - TF_MODEL_FOR_CAUSAL_LM_MAPPING, - TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, - TF_MODEL_FOR_MASKED_LM_MAPPING, - TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING, - TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, - TF_MODEL_FOR_PRETRAINING_MAPPING, - TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, - TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, - TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - BertConfig, - TFAutoModel, - TFAutoModelForSequenceClassification, - TFBertModel, - TFSharedEmbeddings, - tf_top_k_top_p_filtering, - ) - from transformers.generation_tf_utils import ( - TFBeamSampleDecoderOnlyOutput, 
- TFBeamSampleEncoderDecoderOutput, - TFBeamSearchDecoderOnlyOutput, - TFBeamSearchEncoderDecoderOutput, - TFGreedySearchDecoderOnlyOutput, - TFGreedySearchEncoderDecoderOutput, - TFSampleDecoderOnlyOutput, - TFSampleEncoderDecoderOutput, - ) +from ...utils import logging +from .configuration_convnext import ConvNextConfig - if _tf_gpu_memory_limit is not None: - gpus = tf.config.list_physical_devices("GPU") - for gpu in gpus: - # Restrict TensorFlow to only allocate x GB of memory on the GPUs - try: - tf.config.set_logical_device_configuration( - gpu, - [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)], - ) - logical_gpus = tf.config.list_logical_devices("GPU") - print("Logical GPUs", logical_gpus) - except RuntimeError as e: - # Virtual devices must be set before GPUs have been initialized - print(e) - - -def _config_zero_init(config): - configs_no_init = copy.deepcopy(config) - for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key: - setattr(configs_no_init, key, 0.0) - return configs_no_init - - -@require_tf -class TFModelTesterMixin: - - model_tester = None - all_model_classes = () - all_generative_model_classes = () - test_mismatched_shapes = True - test_resize_embeddings = True - test_head_masking = True - is_encoder_decoder = False - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict: - inputs_dict = copy.deepcopy(inputs_dict) - - if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs_dict = { - k: tf.tile( - tf.expand_dims(v, 1), - (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1), - ) - if isinstance(v, tf.Tensor) and v.ndim > 0 - else v - for k, v in inputs_dict.items() - } - if return_labels: - if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32) - elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING): - 
inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) - inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) - elif model_class in [ - *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), - *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), - ]: - inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) - elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING): - inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) - elif model_class in [ - *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), - *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING), - *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING), - *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING), - *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), - *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING), - ]: - inputs_dict["labels"] = tf.zeros( - ( - self.model_tester.batch_size, - self.model_tester.seq_length, - ), - dtype=tf.int32, - ) - return inputs_dict - - def test_initialization(self): - pass - - def test_save_load(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname, saved_model=False) - model = model_class.from_pretrained(tmpdirname) - after_outputs = model(self._prepare_for_class(inputs_dict, model_class)) - - self.assert_outputs_same(after_outputs, outputs) - - def test_save_load_config(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - model_config = model.get_config() - # make sure 
logger = logging.get_logger(__name__)


_CONFIG_FOR_DOC = "ConvNextConfig"
_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"


class TFConvNextDropPath(tf.keras.layers.Layer):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    References:
        (1) github.com:rwightman/pytorch-image-models

    Args:
        drop_path (`float`): Probability of zeroing the incoming tensor for a given sample while training.
    """

    def __init__(self, drop_path, **kwargs):
        super().__init__(**kwargs)
        self.drop_path = drop_path

    def call(self, x, training=None):
        """Randomly zero whole samples of `x` when `training` is truthy; return `x` unchanged otherwise."""
        if training:
            keep_prob = 1 - self.drop_path
            # One random draw per sample, broadcast over all remaining axes. The rank comes from the static
            # shape so that we never call `len()` on the dynamic `tf.shape(x)` tensor.
            shape = (tf.shape(x)[0],) + (1,) * (len(x.shape) - 1)
            # floor(keep_prob + U[0, 1)) is 1 with probability `keep_prob` and 0 otherwise.
            random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
            random_tensor = tf.floor(random_tensor)
            # Rescale the surviving samples so the expected value is unchanged.
            return (x / keep_prob) * random_tensor
        return x


class TFConvNextEmbeddings(tf.keras.layers.Layer):
    """This class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.

    It applies a strided convolution (kernel size and stride both equal to `config.patch_size`) followed by layer
    normalization.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.patch_embeddings = tf.keras.layers.Conv2D(
            filters=config.hidden_sizes[0],
            kernel_size=config.patch_size,
            strides=config.patch_size,
            name="patch_embeddings",
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
        )
        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")

    def call(self, pixel_values):
        """Embed `pixel_values` (a tensor, or a dict holding one under the `"pixel_values"` key)."""
        if isinstance(pixel_values, dict):
            pixel_values = pixel_values["pixel_values"]

        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
        # So change the input format from `NCHW` to `NHWC`.
        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        embeddings = self.patch_embeddings(pixel_values)
        embeddings = self.layernorm(embeddings)
        return embeddings


class TFConvNextLayer(tf.keras.layers.Layer):
    """This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations:
        (1) [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in (N, C, H, W)
        (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to follow
    NHWC ordering, we can just apply the operations straight-away without the permutation.

    Args:
        config ([`ConvNextConfig`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    """

    def __init__(self, config, dim, drop_path=0.0, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim
        self.config = config
        self.dwconv = tf.keras.layers.Conv2D(
            filters=dim,
            kernel_size=7,
            padding="same",
            groups=dim,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="dwconv",
        )  # depthwise conv
        self.layernorm = tf.keras.layers.LayerNormalization(
            epsilon=1e-6,
            name="layernorm",
        )
        self.pwconv1 = tf.keras.layers.Dense(
            units=4 * dim,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="pwconv1",
        )  # pointwise/1x1 convs, implemented with linear layers
        self.act = get_tf_activation(config.hidden_act)
        self.pwconv2 = tf.keras.layers.Dense(
            units=dim,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="pwconv2",
        )
        # Using `layers.Activation` instead of `tf.identity` to better control `training`
        # behaviour.
        self.drop_path = (
            TFConvNextDropPath(
                drop_path,
                name="drop_path",
            )
            if drop_path > 0.0
            else tf.keras.layers.Activation(
                "linear",
                name="drop_path",
            )
        )

    def build(self, input_shape: tf.TensorShape):
        # PT's `nn.Parameters` must be mapped to a TF layer weight to inherit the same name hierarchy (and vice-versa)
        self.layer_scale_parameter = (
            self.add_weight(
                shape=(self.dim,),
                initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
                trainable=True,
                name="layer_scale_parameter",
            )
            if self.config.layer_scale_init_value > 0
            else None
        )
        super().build(input_shape)

    def call(self, hidden_states, training=False):
        """Apply the block to `hidden_states` and add the result back onto the input (residual connection)."""
        residual = hidden_states
        x = self.dwconv(hidden_states)
        x = self.layernorm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)

        # Layer scale is only created in `build` when `config.layer_scale_init_value > 0`.
        if self.layer_scale_parameter is not None:
            x = self.layer_scale_parameter * x

        x = residual + self.drop_path(x, training=training)
        return x


class TFConvNextStage(tf.keras.layers.Layer):
    """ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextConfig`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        kernel_size (`int`): Kernel size of the downsampling convolution. Default: 2.
        stride (`int`): Stride of the downsampling convolution. Default: 2.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`List[float]`): Stochastic depth rates for each layer. Defaults to `0.0` for every layer.
    """

    def __init__(
        self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs
    ):
        super().__init__(**kwargs)
        if in_channels != out_channels or stride > 1:
            self.downsampling_layer = [
                tf.keras.layers.LayerNormalization(
                    epsilon=1e-6,
                    name="downsampling_layer.0",
                ),
                # Inputs to this layer will follow NHWC format since we
                # transposed the inputs from NCHW to NHWC in the `TFConvNextEmbeddings`
                # layer. All the outputs throughout the model will be in NHWC
                # from this point on until the output where we again change to
                # NCHW.
                tf.keras.layers.Conv2D(
                    filters=out_channels,
                    kernel_size=kernel_size,
                    strides=stride,
                    kernel_initializer=get_initializer(config.initializer_range),
                    bias_initializer="zeros",
                    name="downsampling_layer.1",
                ),
            ]
        else:
            self.downsampling_layer = [tf.identity]

        drop_path_rates = drop_path_rates or [0.0] * depth
        self.layers = [
            TFConvNextLayer(
                config,
                dim=out_channels,
                drop_path=drop_path_rates[j],
                name=f"layers.{j}",
            )
            for j in range(depth)
        ]

    def call(self, hidden_states):
        """Run the downsampling step(s) and then every residual block, in order."""
        for layer in self.downsampling_layer:
            hidden_states = layer(hidden_states)
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states


class TFConvNextEncoder(tf.keras.layers.Layer):
    """Stack of `config.num_stages` [`TFConvNextStage`] layers.

    The stochastic depth rate grows linearly from `0.0` to `config.drop_path_rate` across all residual blocks of the
    encoder; each stage receives the slice of rates that belongs to its own blocks.

    Args:
        config ([`ConvNextConfig`]): Model configuration class.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.stages = []
        # Plain Python floats, one per residual block over the whole encoder.
        drop_path_rates = np.linspace(0.0, config.drop_path_rate, sum(config.depths)).tolist()
        cur = 0
        prev_chs = config.hidden_sizes[0]
        for i in range(config.num_stages):
            out_chs = config.hidden_sizes[i]
            depth = config.depths[i]
            stage = TFConvNextStage(
                config,
                in_channels=prev_chs,
                out_channels=out_chs,
                stride=2 if i > 0 else 1,
                depth=depth,
                # The stage indexes this per layer, so it needs a list of `depth` rates, not a single value.
                drop_path_rates=drop_path_rates[cur : cur + depth],
                name=f"stages.{i}",
            )
            self.stages.append(stage)
            cur += depth
            prev_chs = out_chs

    def call(self, hidden_states, output_hidden_states=False, return_dict=True):
        """Run every stage in order.

        Args:
            hidden_states: Input feature map (the embeddings output).
            output_hidden_states (`bool`): Whether to also return the input of every stage plus the final output.
            return_dict (`bool`): Return a [`TFBaseModelOutput`] when `True`, a plain tuple otherwise.
        """
        all_hidden_states = () if output_hidden_states else None

        for stage in self.stages:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            hidden_states = stage(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return TFBaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
        )
filepath = os.path.join(tmpdirname, "keras_model.h5") - model.save(filepath) - if "T5" in main_layer_class.__name__: - model = tf.keras.models.load_model( - filepath, - custom_objects={ - main_layer_class.__name__: main_layer_class, - "TFSharedEmbeddings": TFSharedEmbeddings, - }, - ) - else: - model = tf.keras.models.load_model( - filepath, - custom_objects={main_layer_class.__name__: main_layer_class}, - ) - assert isinstance(model, tf.keras.Model) - after_outputs = model(inputs_dict) - self.assert_outputs_same(after_outputs, outputs) - - def assert_outputs_same(self, after_outputs, outputs): - # Make sure we don't have nans - if isinstance(after_outputs, tf.Tensor): - out_1 = after_outputs.numpy() - elif isinstance(after_outputs, dict): - out_1 = after_outputs[list(after_outputs.keys())[0]].numpy() - else: - out_1 = after_outputs[0].numpy() - out_2 = outputs[0].numpy() - self.assertEqual(out_1.shape, out_2.shape) - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - @is_pt_tf_cross_test - def test_pt_tf_model_equivalence(self): - import torch - - import transformers - - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning - pt_model_class = getattr(transformers, pt_model_class_name) - - config.output_hidden_states = True - - tf_model = model_class(config) - pt_model = pt_model_class(config) - - # Check we can load pt model in tf and vice-versa with model => model functions - tf_model = transformers.load_pytorch_model_in_tf2_model( - tf_model, - pt_model, - tf_inputs=self._prepare_for_class(inputs_dict, model_class), - ) - pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) - - # Check predictions on first output (logits/hidden-states) are close enought given low-level 
computational differences - pt_model.eval() - pt_inputs_dict = {} - for name, key in self._prepare_for_class(inputs_dict, model_class).items(): - if type(key) == bool: - pt_inputs_dict[name] = key - elif name == "input_values": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - elif name == "pixel_values": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - elif name == "input_features": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - else: - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) - - with torch.no_grad(): - pto = pt_model(**pt_inputs_dict) - tfo = tf_model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - ) - tf_hidden_states = tfo[0].numpy() - pt_hidden_states = pto[0].numpy() - - tf_nans = np.copy(np.isnan(tf_hidden_states)) - pt_nans = np.copy(np.isnan(pt_hidden_states)) - - pt_hidden_states[tf_nans] = 0 - tf_hidden_states[tf_nans] = 0 - pt_hidden_states[pt_nans] = 0 - tf_hidden_states[pt_nans] = 0 - - max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) - self.assertLessEqual(max_diff, 4e-2) - - # Check we can load pt model in tf and vice-versa with checkpoint => model functions - with tempfile.TemporaryDirectory() as tmpdirname: - pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") - torch.save(pt_model.state_dict(), pt_checkpoint_path) - tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) - - tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") - tf_model.save_weights(tf_checkpoint_path) - pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) - - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - pt_inputs_dict = {} - for name, key in self._prepare_for_class(inputs_dict, model_class).items(): - if type(key) == bool: - key = np.array(key, 
dtype=bool) - pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long) - elif name == "input_values": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - elif name == "pixel_values": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - elif name == "input_features": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - else: - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) - - with torch.no_grad(): - pto = pt_model(**pt_inputs_dict) - tfo = tf_model(self._prepare_for_class(inputs_dict, model_class)) - tfo = tfo[0].numpy() - pto = pto[0].numpy() - tf_nans = np.copy(np.isnan(tfo)) - pt_nans = np.copy(np.isnan(pto)) - - pto[tf_nans] = 0 - tfo[tf_nans] = 0 - pto[pt_nans] = 0 - tfo[pt_nans] = 0 - - max_diff = np.amax(np.abs(tfo - pto)) - self.assertLessEqual(max_diff, 4e-2) - - def test_compile_tf_model(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - max_input = getattr(self.model_tester, "max_position_embeddings", 512) - optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) - loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") - - for model_class in self.all_model_classes: - if model_class.__name__ in [ - "TFSpeech2TextModel", - "TFSpeech2TextForConditionalGeneration", - ]: - inputs = { - "decoder_input_ids": tf.keras.Input( - batch_shape=(2, max_input), - name="decoder_input_ids", - dtype="int32", - ), - "input_features": tf.keras.Input( - batch_shape=( - 2, - max_input, - self.model_tester.input_feat_per_channel * self.model_tester.input_channels, - ), - name="input_features", - dtype="float32", - ), - } - elif self.is_encoder_decoder: - inputs = { - "decoder_input_ids": tf.keras.Input( - batch_shape=(2, max_input), - name="decoder_input_ids", - dtype="int32", - ), - "input_ids": tf.keras.Input( - 
batch_shape=(2, max_input), - name="input_ids", - dtype="int32", - ), - } - # `pixel_values` implies that the input is an image - elif model_class.main_input_name == "pixel_values": - inputs = tf.keras.Input( - batch_shape=( - 3, - self.model_tester.num_channels, - self.model_tester.image_size, - self.model_tester.image_size, - ), - name="pixel_values", - dtype="float32", - ) - elif model_class.__name__ in ["TFCLIPModel"]: - inputs = { - "input_ids": tf.keras.Input( - batch_shape=(3, max_input), - name="input_ids", - dtype="int32", - ), - "pixel_values": tf.keras.Input( - batch_shape=( - 3, - self.model_tester.vision_model_tester.num_channels, - self.model_tester.vision_model_tester.image_size, - self.model_tester.vision_model_tester.image_size, - ), - name="pixel_values", - dtype="float32", - ), - } - elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs = tf.keras.Input( - batch_shape=(4, 2, max_input), - name="input_ids", - dtype="int32", - ) - else: - inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") - - # Prepare our model - model = model_class(config) - model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving. 
- # Let's load it from the disk to be sure we can use pretrained weights - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname, saved_model=False) - model = model_class.from_pretrained(tmpdirname) - - outputs_dict = model(inputs) - hidden_states = outputs_dict[0] - - # Add a dense layer on top to test integration with other keras modules - outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) - - # Compile extended model - extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs]) - extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - - def test_keyword_and_dict_args(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs_dict = model(inputs) - - inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - outputs_keywords = model(**inputs_keywords) - output_dict = outputs_dict[0].numpy() - output_keywords = outputs_keywords[0].numpy() - - self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) - - def test_attention_outputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - decoder_seq_length = getattr( - self.model_tester, - "decoder_seq_length", - self.model_tester.seq_length, +@keras_serializable +class TFConvNextMainLayer(tf.keras.layers.Layer): + config_class = ConvNextConfig + + def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embeddings = TFConvNextEmbeddings(config, name="embeddings") + self.encoder = TFConvNextEncoder(config, name="encoder") + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.pooler = 
tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None + + def call( + self, + pixel_values: Optional[TFModelInputType] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - encoder_seq_length = getattr( - self.model_tester, - "encoder_seq_length", - self.model_tester.seq_length, + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, ) - decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - - def check_decoder_attentions_output(outputs): - out_len = len(outputs) - self.assertEqual(min(out_len % 2, out_len % 5), 0) # differentiation due to newly added cross_attentions - decoder_attentions = outputs.decoder_attentions - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - decoder_key_length, - ], - ) - def check_encoder_attentions_output(outputs): - attentions = [ - t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions) - ] - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length, - ], - ) + if "input_ids" in inputs: + inputs["pixel_values"] = inputs.pop("input_ids") - for 
model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["use_cache"] = False - config.output_hidden_states = False - model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - out_len = len(outputs) - self.assertEqual(config.output_hidden_states, False) - check_encoder_attentions_output(outputs) - - if self.is_encoder_decoder: - model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - self.assertEqual(config.output_hidden_states, False) - check_decoder_attentions_output(outputs) - - # Check that output attentions can also be changed via the config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - self.assertEqual(config.output_hidden_states, False) - check_encoder_attentions_output(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - config.output_hidden_states = True - model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) - self.assertEqual(model.config.output_hidden_states, True) - check_encoder_attentions_output(outputs) - - def test_headmasking(self): - if not self.test_head_masking: - return - - random.Random().seed(42) - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - random.Random().seed() - - inputs_dict["output_attentions"] = True - config.output_hidden_states = True - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - - # Prepare head_mask - def prepare_layer_head_mask(i, attention_heads, num_hidden_layers): - if i == 0: - return tf.concat( - ( - tf.zeros(1, dtype=tf.float32), - 
tf.ones(attention_heads - 1, dtype=tf.float32), - ), - 0, - ) - elif i == num_hidden_layers - 1: - return tf.concat( - ( - tf.zeros(attention_heads - 1, dtype=tf.float32), - tf.ones(1, dtype=tf.float32), - ), - 0, - ) - else: - return tf.ones(attention_heads, dtype=tf.float32) - - head_mask = tf.stack( - [ - prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers) - for i in range(config.num_hidden_layers) - ], - 0, - ) + if inputs["pixel_values"] is None: + raise ValueError("You have to specify pixel_values") - inputs = self._prepare_for_class(inputs_dict, model_class).copy() - inputs["head_mask"] = head_mask - if model.config.is_encoder_decoder: - signature = inspect.signature(model.call) - arg_names = [*signature.parameters.keys()] - if "decoder_head_mask" in arg_names: # necessary diferentiation because of T5 model - inputs["decoder_head_mask"] = head_mask - if "cross_attn_head_mask" in arg_names: - inputs["cross_attn_head_mask"] = head_mask - - outputs = model(**inputs, return_dict=True) - - def check_attentions_validity(attentions): - # Remove Nan - for t in attentions: - self.assertLess( - (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), - (tf.size(t) / 4).numpy(), - ) # Check we don't have more than 25% nans (arbitrary) - - attentions = [ - tf.where(tf.math.is_nan(t), 0.0, t) for t in attentions - ] # remove them (the test is less complete) - - self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0) - self.assertNotEqual( - tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), - 0.0, - ) - if len(attentions) > 2: # encoder-decodere models have only 2 layers in each modules - self.assertNotEqual( - tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), - 0.0, - ) - self.assertAlmostEqual( - tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), - 0.0, - ) - self.assertNotEqual( - tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), - 0.0, - ) - - if 
model.config.is_encoder_decoder: - check_attentions_validity(outputs.encoder_attentions) - check_attentions_validity(outputs.decoder_attentions) - if "cross_attn_head_mask" in arg_names: - check_attentions_validity(outputs.cross_attentions) - else: - check_attentions_validity(outputs.attentions) - - def test_hidden_states_output(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - def check_hidden_states_output(config, inputs_dict, model_class): - model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - expected_num_layers = getattr( - self.model_tester, - "expected_num_hidden_layers", - self.model_tester.num_hidden_layers + 1, - ) + embedding_output = self.embeddings(inputs["pixel_values"], training=inputs["training"]) - if model.config.is_encoder_decoder: - encoder_hidden_states = outputs.encoder_hidden_states - decoder_hidden_states = outputs.decoder_hidden_states - - self.assertEqual(config.output_attentions, False) - self.assertEqual(len(encoder_hidden_states), expected_num_layers) - self.assertListEqual( - list(encoder_hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], - ) - self.assertEqual(len(decoder_hidden_states), expected_num_layers) - self.assertListEqual( - list(decoder_hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], - ) - else: - hidden_states = outputs.hidden_states - self.assertEqual(config.output_attentions, False) - self.assertEqual(len(hidden_states), expected_num_layers) - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], - ) - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(config, inputs_dict, model_class) - - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - 
check_hidden_states_output(config, inputs_dict, model_class) - - def test_model_common_attributes(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - text_in_text_out_models = ( - get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) - + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) - + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING) + encoder_outputs = self.encoder( + embedding_output, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=inputs["training"], ) - speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING) - - for model_class in self.all_model_classes: - model = model_class(config) - assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) - if model_class in text_in_text_out_models: - x = model.get_output_embeddings() - assert isinstance(x, tf.keras.layers.Layer) - name = model.get_bias() - assert isinstance(name, dict) - for k, v in name.items(): - assert isinstance(v, tf.Variable) - elif model_class in speech_in_text_out_models: - x = model.get_output_embeddings() - assert isinstance(x, tf.keras.layers.Layer) - name = model.get_bias() - assert name is None - else: - x = model.get_output_embeddings() - assert x is None - name = model.get_bias() - assert name is None - - def test_determinism(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - first, second = ( - model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - )[0], - model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - )[0], - ) - out_1 = first.numpy() - out_2 = second.numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_model_outputs_equivalence(self): - - ( - config, - inputs_dict, - ) = 
self.model_tester.prepare_config_and_inputs_for_common() - - def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) - dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - all(tf.equal(tuple_object, dict_object)), - msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}", - ) - - recursive_check(tuple_output, dict_output) - - for model_class in self.all_model_classes: - model = model_class(config) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, 
{"output_hidden_states": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence( - model, - tuple_inputs, - dict_inputs, - {"output_hidden_states": True, "output_attentions": True}, - ) - def test_inputs_embeds(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - - inputs = copy.deepcopy(inputs_dict) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids) - else: - inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids) - inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids) - - inputs = self._prepare_for_class(inputs, model_class) - - model(inputs) - - def test_numpy_arrays_inputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - def prepare_numpy_arrays(inputs_dict): - inputs_np_dict = {} - for k, v in inputs_dict.items(): - if tf.is_tensor(v): - inputs_np_dict[k] = v.numpy() - else: - inputs_np_dict[k] = np.array(k) - - return inputs_np_dict - - for model_class in self.all_model_classes: - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class) - inputs_np = 
prepare_numpy_arrays(inputs) - - output_for_dict_input = model(inputs_np) - output_for_kw_input = model(**inputs_np) - self.assert_outputs_same(output_for_dict_input, output_for_kw_input) - - def test_resize_token_embeddings(self): - if not self.test_resize_embeddings: - return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - def _get_word_embedding_weight(model, embedding_layer): - embeds = getattr(embedding_layer, "weight", None) - if embeds is not None: - return embeds - - embeds = getattr(embedding_layer, "decoder", None) - if embeds is not None: - return embeds - - model(model.dummy_inputs) - - embeds = getattr(embedding_layer, "weight", None) - if embeds is not None: - return embeds - - embeds = getattr(embedding_layer, "decoder", None) - if embeds is not None: - return embeds - - return None - - for model_class in self.all_model_classes: - for size in [config.vocab_size - 10, config.vocab_size + 10, None]: - # build the embeddings - model = model_class(config=config) - old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) - old_bias = model.get_bias() - old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) - # reshape the embeddings - model.resize_token_embeddings(size) - new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) - new_bias = model.get_bias() - new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) - - # check that the resized embeddings size matches the desired size. 
- assert_size = size if size is not None else config.vocab_size - self.assertEqual(new_input_embeddings.shape[0], assert_size) - - # check that weights remain the same after resizing - models_equal = True - for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): - if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: - models_equal = False - self.assertTrue(models_equal) - - if old_bias is not None and new_bias is not None: - for old_weight, new_weight in zip(old_bias.values(), new_bias.values()): - self.assertEqual(new_weight.shape[0], assert_size) - - models_equal = True - for p1, p2 in zip(old_weight.value(), new_weight.value()): - if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: - models_equal = False - self.assertTrue(models_equal) - - if old_output_embeddings is not None and new_output_embeddings is not None: - self.assertEqual(new_output_embeddings.shape[0], assert_size) - self.assertEqual( - new_output_embeddings.shape[1], - old_output_embeddings.shape[1], - ) - - models_equal = True - for p1, p2 in zip( - old_output_embeddings.value(), - new_output_embeddings.value(), - ): - if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: - models_equal = False - self.assertTrue(models_equal) - - def test_lm_head_model_random_no_beam_search_generate(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - - # iterate over all generative models - for model_class in self.all_generative_model_classes: - model = model_class(config) - - if config.bos_token_id is None: - # if bos token id is not defined model needs input_ids - with self.assertRaises(AssertionError): - model.generate(do_sample=True, max_length=5) - # num_return_sequences = 1 - self._check_generated_ids(model.generate(input_ids, do_sample=True)) - elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]: - # Models with non-text inputs won't work here; num_return_sequences = 1 - 
self._check_generated_ids(model.generate(do_sample=True, max_length=5)) - - with self.assertRaises(ValueError): - # generating multiple sequences when no beam search generation - # is not allowed as it would always generate the same sequences - model.generate(input_ids, do_sample=False, num_return_sequences=2) - - # num_return_sequences > 1, sample - self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2)) - - # check bad words tokens language generation - # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [ - self._generate_random_bad_tokens(1, model), - self._generate_random_bad_tokens(2, model), - ] - output_tokens = model.generate( - input_ids, - do_sample=True, - bad_words_ids=bad_words_ids, - num_return_sequences=2, - ) - # only count generated tokens - generated_ids = output_tokens[:, input_ids.shape[-1] :] - self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) - - def test_lm_head_model_no_beam_search_generate_dict_outputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - if input_ids is None: - input_ids = inputs_dict.get("input_features", None) - - # iterate over all generative models - for model_class in self.all_generative_model_classes: - model = model_class(config) - output_greedy = model.generate( - input_ids, - do_sample=False, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - output_sample = model.generate( - input_ids, - do_sample=True, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) + last_hidden_state = encoder_outputs[0] + pooled_output = self.layernorm(self.pooler(last_hidden_state)) - if model.config.is_encoder_decoder: - self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput) - 
self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput) - else: - self.assertIsInstance(output_greedy, TFGreedySearchDecoderOnlyOutput) - self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput) - - def test_lm_head_model_random_beam_search_generate(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - if config.bos_token_id is None: - # if bos token id is not defined model needs input_ids, num_return_sequences = 1 - self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2)) - else: - # num_return_sequences = 1 - self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2)) - - with self.assertRaises(AssertionError): - # generating more sequences than having beams leads is not possible - model.generate( - input_ids, - do_sample=False, - num_return_sequences=3, - num_beams=2, - ) - - # num_return_sequences > 1, sample - self._check_generated_ids( - model.generate( - input_ids, - do_sample=True, - num_beams=2, - num_return_sequences=2, - ) - ) - # num_return_sequences > 1, greedy - self._check_generated_ids( - model.generate( - input_ids, - do_sample=False, - num_beams=2, - num_return_sequences=2, - ) - ) + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] - # check bad words tokens language generation - # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [ - self._generate_random_bad_tokens(1, model), - self._generate_random_bad_tokens(2, model), - ] - output_tokens = model.generate( - input_ids, - do_sample=False, - bad_words_ids=bad_words_ids, - num_beams=2, - num_return_sequences=2, - ) - # only count generated tokens - generated_ids = output_tokens[:, input_ids.shape[-1] :] - self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), 
bad_words_ids)) - - def test_lm_head_model_beam_search_generate_dict_outputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - if input_ids is None: - input_ids = inputs_dict.get("input_features", None) - - # iterate over all generative models - for model_class in self.all_generative_model_classes: - model = model_class(config) - output_beam_search = model.generate( - input_ids, - num_beams=2, - do_sample=False, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - output_beam_sample = model.generate( - input_ids, - num_beams=2, - do_sample=True, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) + return TFBaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) - if model.config.is_encoder_decoder: - self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput) - self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput) - else: - self.assertIsInstance(output_beam_search, TFBeamSearchDecoderOnlyOutput) - self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput) - - def test_loss_computation(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - if getattr(model, "hf_compute_loss", None): - # The number of elements in the loss should be the same as the number of elements in the label - prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - added_label = prepared_for_class[ - sorted( - list(prepared_for_class.keys() - inputs_dict.keys()), - reverse=True, - )[0] - ] - loss_size = tf.size(added_label) - - if model.__class__ in 
get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING): - # if loss is causal lm loss, labels are shift, so that one label per batch - # is cut - loss_size = loss_size - self.model_tester.batch_size - - # Test that model correctly compute the loss with kwargs - prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - possible_input_names = { - "input_ids", - "pixel_values", - "input_features", - } - input_name = possible_input_names.intersection(set(prepared_for_class)).pop() - model_input = prepared_for_class.pop(input_name) - - loss = model(model_input, **prepared_for_class)[0] - self.assertEqual(loss.shape, [loss_size]) - - # Test that model correctly compute the loss with a dict - prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - loss = model(prepared_for_class)[0] - self.assertEqual(loss.shape, [loss_size]) - - # Test that model correctly compute the loss with a tuple - prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - - # Get keys that were added with the _prepare_for_class function - label_keys = prepared_for_class.keys() - inputs_dict.keys() - signature = inspect.signature(model.call).parameters - signature_names = list(signature.keys()) - - # Create a dictionary holding the location of the tensors in the tuple - tuple_index_mapping = {0: input_name} - for label_key in label_keys: - label_key_index = signature_names.index(label_key) - tuple_index_mapping[label_key_index] = label_key - sorted_tuple_index_mapping = sorted(tuple_index_mapping.items()) - # Initialize a list with their default values, update the values and convert to a tuple - list_input = [] - - for name in signature_names: - if name != "kwargs": - list_input.append(signature[name].default) - - for index, value in sorted_tuple_index_mapping: - list_input[index] = prepared_for_class[value] - - tuple_input = tuple(list_input) - - # Send to model - loss = 
model(tuple_input[:-1])[0] - - self.assertEqual(loss.shape, [loss_size]) - - def test_generate_with_headmasking(self): - attention_names = [ - "encoder_attentions", - "decoder_attentions", - "cross_attentions", - ] - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - # We want to test only encoder-decoder models - if not config.is_encoder_decoder: - continue - - head_masking = { - "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)), - "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), - "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), - } - signature = inspect.signature(model.call) - if set(head_masking.keys()) < set([*signature.parameters.keys()]): - continue - - for attn_name, (name, mask) in zip(attention_names, head_masking.items()): - out = model.generate( - inputs_dict["input_ids"], - num_beams=1, - max_length=inputs_dict["input_ids"] + 5, - output_attentions=True, - return_dict_in_generate=True, - **{name: mask}, - ) - # We check the state of decoder_attentions and cross_attentions just from the last step - attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] - self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0) - - def test_load_with_mismatched_shapes(self): - if not self.test_mismatched_shapes: - return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): - continue - - with self.subTest(msg=f"Testing {model_class}"): - with tempfile.TemporaryDirectory() as tmp_dir: - model = model_class(config) - inputs = self._prepare_for_class(inputs_dict, model_class) - _ = model(**inputs) - 
model.save_pretrained(tmp_dir) - - # Fails when we don't set ignore_mismatched_sizes=True - with self.assertRaises(ValueError): - new_model = TFAutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42) - with self.assertRaises(ValueError): - new_model_without_prefix = TFAutoModel.from_pretrained(tmp_dir, vocab_size=10) - - logger = logging.get_logger("transformers.modeling_tf_utils") - with CaptureLogger(logger) as cl: - new_model = TFAutoModelForSequenceClassification.from_pretrained( - tmp_dir, num_labels=42, ignore_mismatched_sizes=True - ) - self.assertIn("the shapes did not match", cl.out) - - logits = new_model(**inputs).logits - self.assertEqual(logits.shape[1], 42) - - with CaptureLogger(logger) as cl: - new_model_without_prefix = TFAutoModel.from_pretrained( - tmp_dir, vocab_size=10, ignore_mismatched_sizes=True - ) - self.assertIn("the shapes did not match", cl.out) - - # Although Tf models always have a prefix pointing to `MainLayer`, - # we still add this "without prefix" test to keep a consistency between tf and pt tests. 
- input_ids = ids_tensor((2, 8), 10) - if self.is_encoder_decoder: - new_model_without_prefix(input_ids, decoder_input_ids=input_ids) - else: - new_model_without_prefix(input_ids) - - def test_model_main_input_name(self): - for model_class in self.all_model_classes: - model_signature = inspect.signature(getattr(model_class, "call")) - # The main input is the name of the argument after `self` - observed_main_input_name = list(model_signature.parameters.keys())[1] - self.assertEqual(model_class.main_input_name, observed_main_input_name) - - def _generate_random_bad_tokens(self, num_bad_tokens, model): - # special tokens cannot be bad tokens - special_tokens = [] - if model.config.bos_token_id is not None: - special_tokens.append(model.config.bos_token_id) - if model.config.pad_token_id is not None: - special_tokens.append(model.config.pad_token_id) - if model.config.eos_token_id is not None: - special_tokens.append(model.config.eos_token_id) - - # create random bad tokens that are not special tokens - bad_tokens = [] - while len(bad_tokens) < num_bad_tokens: - token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0] - if token not in special_tokens: - bad_tokens.append(token) - return bad_tokens - - def _check_generated_ids(self, output_ids): - for token_id in output_ids[0].numpy().tolist(): - self.assertGreaterEqual(token_id, 0) - self.assertLess(token_id, self.model_tester.vocab_size) - - def _check_match_tokens(self, generated_ids, bad_words_ids): - # for all bad word tokens - for bad_word_ids in bad_words_ids: - # for all slices in batch - for generated_ids_slice in generated_ids: - # for all word idx - for i in range(len(bad_word_ids), len(generated_ids_slice)): - # if tokens match - if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids: - return True - return False - - -def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): - """Creates a random int32 tensor of the shape within the vocab size.""" - if rng is 
None: - rng = random.Random() - - total_dims = 1 - for dim in shape: - total_dims *= dim - - values = [] - for _ in range(total_dims): - values.append(rng.randint(0, vocab_size - 1)) - - output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32) - - return output - - -def random_attention_mask(shape, rng=None, name=None, dtype=None): - attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype) - # make sure that at least one token is attended to for each batch - attn_mask = tf.concat( - [ - tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), - attn_mask[:, 1:], - ], - axis=1, +class TFConvNextPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ConvNextConfig + base_model_prefix = "convnext" + main_input_name = "pixel_values" + + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. + + Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. + """ + VISION_DUMMY_INPUTS = tf.random.uniform( + shape=( + 3, + self.config.num_channels, + self.config.image_size, + self.config.image_size, + ), + dtype=tf.float32, + ) + return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} + + @tf.function( + input_signature=[ + { + "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), + } + ] ) - return attn_mask + def serving(self, inputs): + """ + Method used for serving the model. + Args: + inputs (`Dict[str, tf.Tensor]`): + The input of the saved model as a dictionary of tensors. + """ + return self.call(inputs) -def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): - """Creates a random float32 tensor""" - if rng is None: - rng = random.Random() - total_dims = 1 - for dim in shape: - total_dims *= dim +CONVNEXT_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) - values = [] - for _ in range(total_dims): - values.append(rng.random() * scale) + This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and + behavior. - return tf.reshape( - tf.constant(values, dtype=dtype if dtype is not None else tf.float32), - shape=shape, - ) + + TF 2.0 models accept two formats as inputs: -@require_tf -class UtilsFunctionsTest(unittest.TestCase): - - # tests whether the top_k_top_p_filtering function behaves as expected - def test_top_k_top_p_filtering(self): - logits = tf.convert_to_tensor( - [ - [ - 8.2220991, # 3rd highest value; idx. 0 - -0.5620044, - 5.23229752, - 4.0386393, - -6.8798378, - -0.54785802, - -3.2012153, - 2.92777176, - 1.88171953, - 7.35341276, # 5th highest value; idx. 9 - 8.43207833, # 2nd highest value; idx. 10 - -9.85711836, - -5.96209236, - -1.13039161, - -7.1115294, - -0.8369633, - -5.3186408, - 7.06427407, - 0.81369344, - -0.82023817, - -5.9179796, - 0.58813443, - -6.99778438, - 4.71551189, - -0.18771637, - 7.44020759, # 4th highest value; idx. 25 - 9.38450987, # 1st highest value; idx. 26 - 2.12662941, - -9.32562038, - 2.35652522, - ], # cummulative prob of 5 highest values <= 0.6 - [ - 0.58425518, - 4.53139238, - -5.57510464, - -6.28030699, - -7.19529503, - -4.02122551, - 1.39337037, - -6.06707057, - 1.59480517, - -9.643119, - 0.03907799, - 0.67231762, - -8.88206726, - 6.27115922, # 4th highest value; idx. 13 - 2.28520723, - 4.82767506, - 4.30421368, - 8.8275313, # 2nd highest value; idx. 17 - 5.44029958, # 5th highest value; idx. 18 - -4.4735794, - 7.38579536, # 3rd highest value; idx. 
20 - -2.91051663, - 2.61946077, - -2.5674762, - -9.48959302, - -4.02922645, - -1.35416918, - 9.67702323, # 1st highest value; idx. 27 - -5.89478553, - 1.85370467, - ], # cummulative prob of 5 highest values <= 0.6 - ], - dtype=tf.float32, + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the + tensors in the first argument of the model call function: `model(inputs)`. + + + + Parameters: + config ([`ConvNextConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CONVNEXT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`ConvNextFeatureExtractor`]. See + [`ConvNextFeatureExtractor.__call__`] for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used + in eager mode, in graph mode the value will always be set to True. 
+""" + + +@add_start_docstrings( + "The bare ConvNext model outputting raw features without any specific head on top.", + CONVNEXT_START_DOCSTRING, +) +class TFConvNextModel(TFConvNextPreTrainedModel): + def __init__(self, config, *inputs, add_pooling_layer=True, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.convnext = TFConvNextMainLayer(config, add_pooling_layer=add_pooling_layer, name="convnext") + + @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: Optional[TFModelInputType] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import ConvNextFeatureExtractor, TFConvNextModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") + >>> model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224") + + >>> inputs = feature_extractor(images=image, return_tensors="tf") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, ) - non_inf_expected_idx = tf.convert_to_tensor( - [ - [0, 0], - [0, 9], - [0, 10], 
- [0, 25], - [0, 26], - [1, 13], - [1, 17], - [1, 18], - [1, 20], - [1, 27], - ], - dtype=tf.int32, - ) # expected non filtered idx as noted above - - non_inf_expected_output = tf.convert_to_tensor( - [ - 8.222099, - 7.3534126, - 8.432078, - 7.4402075, - 9.38451, - 6.271159, - 8.827531, - 5.4402995, - 7.3857956, - 9.677023, - ], - dtype=tf.float32, - ) # expected non filtered values as noted above + if "input_ids" in inputs: + inputs["pixel_values"] = inputs.pop("input_ids") - output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4) + if inputs["pixel_values"] is None: + raise ValueError("You have to specify pixel_values") - non_inf_output = output[output != -float("inf")] - non_inf_idx = tf.cast( - tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))), - dtype=tf.int32, + outputs = self.convnext( + pixel_values=inputs["pixel_values"], + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=inputs["training"], ) - tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12) - tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx) - - -@require_tf -@is_staging_test -class TFModelPushToHubTester(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls._token = login(username=USER, password=PASS) - - @classmethod - def tearDownClass(cls): - try: - delete_repo(token=cls._token, name="test-model-tf") - except HTTPError: - pass - - try: - delete_repo( - token=cls._token, - name="test-model-tf-org", - organization="valid_org", - ) - except HTTPError: - pass - - def test_push_to_hub(self): - config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + # converts back NHWC -> NCHW, to match PT's output + if not return_dict: + return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 
3, 1, 2)), + pooler_output=outputs.pooler_output, + hidden_states=outputs.hidden_states, ) - model = TFBertModel(config) - # Make sure model is properly initialized - _ = model(model.dummy_inputs) - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained( - os.path.join(tmp_dir, "test-model-tf"), - push_to_hub=True, - use_auth_token=self._token, - ) - new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf") - models_equal = True - for p1, p2 in zip(model.weights, new_model.weights): - if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: - models_equal = False - self.assertTrue(models_equal) - - def test_push_to_hub_with_model_card(self): - config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + +@add_start_docstrings( + """ + ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + CONVNEXT_START_DOCSTRING, +) +class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: ConvNextConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.convnext = TFConvNextMainLayer(config, name="convnext") + + # Classifier head + self.classifier = tf.keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer="zeros", + name="classifier", ) - model = TFBertModel(config) - with tempfile.TemporaryDirectory() as tmp_dir: - model.push_to_hub(os.path.join(tmp_dir, "test-model-tf")) - self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "test-model-card-tf", "README.md"))) - - def test_push_to_hub_in_organization(self): - config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + + @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING) + 
@replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: Optional[TFModelInputType] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> from transformers import ConvNextFeatureExtractor, TFConvNextForImageClassification + >>> import tensorflow as tf + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") + >>> model = TFConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224") + + >>> inputs = feature_extractor(images=image, return_tensors="tf") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0] + >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)]) + ```""" + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=pixel_values, + 
output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + + if "input_ids" in inputs: + inputs["pixel_values"] = inputs.pop("input_ids") + + if inputs["pixel_values"] is None: + raise ValueError("You have to specify pixel_values") + + outputs = self.convnext( + inputs["pixel_values"], + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=inputs["training"], ) - model = TFBertModel(config) - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained( - os.path.join(tmp_dir, "test-model-tf-org"), - push_to_hub=True, - use_auth_token=self._token, - organization="valid_org", - ) - new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org") - models_equal = True - for p1, p2 in zip(model.weights, new_model.weights): - if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: - models_equal = False - self.assertTrue(models_equal) + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits) + + if not inputs["return_dict"]: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + ) diff --git a/tests/convnext/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py index 6f8c142b654d8..cfc2646176448 100644 --- a/tests/convnext/test_modeling_tf_convnext.py +++ b/tests/convnext/test_modeling_tf_convnext.py @@ -22,8 +22,8 @@ from transformers.file_utils import cached_property, is_tf_available, is_vision_available from transformers.testing_utils import require_tf, require_vision, slow -from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor +from 
..test_configuration_common import ConfigTester +from ..test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor if is_tf_available(): From 3e069429bf6bb0a5e6a54de2fb5e2443b82f5b89 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 24 Feb 2022 13:41:36 +0530 Subject: [PATCH 52/65] chore: applied sgugger's suggestion for dealing w/ output_attentions. --- src/transformers/modeling_tf_utils.py | 122 +++++++++++++++++++++----- 1 file changed, 98 insertions(+), 24 deletions(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index f85f3aaa8e028..d2e5a5ba9ca9d 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -312,7 +312,7 @@ def booleans_processing(config, **kwargs): if tf.executing_eagerly(): final_booleans["output_attentions"] = kwargs.get("output_attentions", None) - if not final_booleans["output_attentions"]: + if final_booleans["output_attentions"] is None: final_booleans["output_attentions"] = config.output_attentions final_booleans["output_hidden_states"] = ( @@ -366,7 +366,17 @@ def input_processing(func, config, input_ids, **kwargs): signature.pop("self", None) parameter_names = list(signature.keys()) output = {} - allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray, KerasTensor) + allowed_types = ( + tf.Tensor, + bool, + int, + ModelOutput, + tuple, + list, + dict, + np.ndarray, + KerasTensor, + ) if "inputs" in kwargs["kwargs_call"]: warnings.warn( @@ -479,7 +489,13 @@ def input_processing(func, config, input_ids, **kwargs): boolean_dict = { k: v for k, v in output.items() - if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"] + if k + in [ + "return_dict", + "output_attentions", + "output_hidden_states", + "use_cache", + ] } output.update( @@ -578,11 +594,18 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False, # If yes we reshape the weight from the H5 file 
accordingly to the current weight # If the two shapes are not compatible we raise an issue try: - array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight)) + array = np.reshape( + saved_weight_value, + K.int_shape(symbolic_weight), + ) except ValueError as e: if ignore_mismatched_sizes: mismatched_layers.append( - (symbolic_weight_name, saved_weight_value.shape, K.int_shape(symbolic_weight)) + ( + symbolic_weight_name, + saved_weight_value.shape, + K.int_shape(symbolic_weight), + ) ) continue else: @@ -626,11 +649,17 @@ def init_copy_embeddings(old_embeddings, new_num_tokens): # and we create a mask to properly identify the padded values and be replaced by the values of the newly created # embeddings current_weights = tf.pad( - old_embeddings.value(), tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=-1 + old_embeddings.value(), + tf.convert_to_tensor([[0, size_diff], [0, 0]]), + constant_values=-1, ) num_tokens_to_copy = min(old_num_tokens, new_num_tokens) mask = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True) - mask = tf.pad(mask, tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=False) + mask = tf.pad( + mask, + tf.convert_to_tensor([[0, size_diff], [0, 0]]), + constant_values=False, + ) else: # if the new size if lower than the old one, we take the current embeddings until the new size current_weights = tf.slice( @@ -775,7 +804,10 @@ def _save_checkpoint(self, checkpoint_dir, epoch): # internally and which users are likely to use too weights_path = os.path.join(checkpoint_dir, "weights.h5") self.save_weights(weights_path) - extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()} + extra_data = { + "epoch": epoch, + "optimizer_state": self.optimizer.get_weights(), + } extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle") with open(extra_data_path, "wb") as f: pickle.dump(extra_data, f) @@ -801,7 +833,10 @@ def load_repo_checkpoint(self, repo_path_or_name): if not 
os.path.isdir(repo_path_or_name): # If this isn't a local path, check that the remote repo exists and has a checkpoint in it repo_files = list_repo_files(repo_path_or_name) - for file in ("checkpoint/weights.h5", "checkpoint/extra_data.pickle"): + for file in ( + "checkpoint/weights.h5", + "checkpoint/extra_data.pickle", + ): if file not in repo_files: raise FileNotFoundError(f"Repo {repo_path_or_name} does not contain checkpoint file {file}!") if "/" not in repo_path_or_name: @@ -809,7 +844,10 @@ def load_repo_checkpoint(self, repo_path_or_name): repo_path_or_name = self.get_full_repo_name(repo_path_or_name) else: model_id = repo_path_or_name.split("/")[-1] - repo = Repository(model_id, clone_from=f"https://huggingface.co/{repo_path_or_name}") + repo = Repository( + model_id, + clone_from=f"https://huggingface.co/{repo_path_or_name}", + ) local_dir = repo.local_dir else: local_dir = repo_path_or_name @@ -1066,7 +1104,8 @@ def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]: `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model. """ warnings.warn( - "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning + "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", + FutureWarning, ) return self.get_lm_head() @@ -1077,7 +1116,10 @@ def get_prefix_bias_name(self) -> Union[None, str]: Return: `str`: The _prefix name of the bias. """ - warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + warnings.warn( + "The method get_prefix_bias_name is deprecated. 
Please use `get_bias` instead.", + FutureWarning, + ) return None def get_bias(self) -> Union[None, Dict[str, tf.Variable]]: @@ -1225,15 +1267,25 @@ def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens): # initialize new bias if tf.math.greater(size_diff, 0): padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]] - current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1) + current_bias = tf.pad( + weight.value(), + tf.convert_to_tensor(padding_shape), + constant_values=-1, + ) num_tokens_to_copy = min(old_num_tokens, new_num_tokens) mask_shape = [num_tokens_to_copy] if first_dim is None else [1, num_tokens_to_copy] bias_mask = tf.fill(tf.convert_to_tensor(mask_shape), True) - bias_mask = tf.pad(bias_mask, tf.convert_to_tensor(padding_shape), constant_values=False) + bias_mask = tf.pad( + bias_mask, + tf.convert_to_tensor(padding_shape), + constant_values=False, + ) else: slice_from = [0] if first_dim is None else [0, 0] current_bias = tf.slice( - weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape) + weight.value(), + tf.convert_to_tensor(slice_from), + tf.convert_to_tensor(final_shape), ) bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True) @@ -1374,7 +1426,11 @@ def save_pretrained(self, save_directory, saved_model=False, version=1, push_to_ if saved_model: saved_model_dir = os.path.join(save_directory, "saved_model", str(version)) - self.save(saved_model_dir, include_optimizer=False, signatures=self.serving) + self.save( + saved_model_dir, + include_optimizer=False, + signatures=self.serving, + ) logger.info(f"Saved model created in {saved_model_dir}") # Save configuration file @@ -1526,7 +1582,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) - user_agent = {"file_type": "model", "framework": "tensorflow", 
"from_auto_class": from_auto_class} + user_agent = { + "file_type": "model", + "framework": "tensorflow", + "from_auto_class": from_auto_class, + } if from_pipeline is not None: user_agent["using_pipeline"] = from_pipeline @@ -1622,7 +1682,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "proxies": proxies, "use_auth_token": use_auth_token, } - if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs): + if has_file( + pretrained_model_name_or_path, + WEIGHTS_NAME, + **has_file_kwargs, + ): raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {TF2_WEIGHTS_NAME} " "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from " @@ -1772,7 +1836,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): # To update the docstring, we need to copy the method, otherwise we change the original docstring. TFPreTrainedModel.push_to_hub = copy_func(TFPreTrainedModel.push_to_hub) TFPreTrainedModel.push_to_hub.__doc__ = TFPreTrainedModel.push_to_hub.__doc__.format( - object="model", object_class="TFAutoModel", object_files="model checkpoint" + object="model", + object_class="TFAutoModel", + object_files="model checkpoint", ) @@ -1801,7 +1867,9 @@ def __init__(self, nf, nx, initializer_range=0.02, **kwargs): def build(self, input_shape): self.weight = self.add_weight( - "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) + "weight", + shape=[self.nx, self.nf], + initializer=get_initializer(self.initializer_range), ) self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) @@ -1839,7 +1907,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range + 
self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range def build(self, input_shape): """ @@ -1847,7 +1915,9 @@ def build(self, input_shape): https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ self.weight = self.add_weight( - "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) + "weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) @@ -1961,7 +2031,9 @@ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, ** else: num_classes = config.hidden_size self.summary = tf.keras.layers.Dense( - num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" + num_classes, + kernel_initializer=get_initializer(initializer_range), + name="summary", ) self.has_activation = False @@ -2056,7 +2128,9 @@ def register_for_auto_class(cls, auto_class="TFAutoModel"): cls._auto_class = auto_class -def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal: +def get_initializer( + initializer_range: float = 0.02, +) -> tf.initializers.TruncatedNormal: """ Creates a `tf.initializers.TruncatedNormal` with the given range. From bc46016955eb70a6e2f6ea31e938f66e738b60af Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 24 Feb 2022 13:43:19 +0530 Subject: [PATCH 53/65] chore: added comments. 
--- src/transformers/modeling_tf_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index d2e5a5ba9ca9d..9d392ec6e4ff0 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -311,6 +311,7 @@ def booleans_processing(config, **kwargs): final_booleans = {} if tf.executing_eagerly(): + # Pure conv models (such as ConvNext) do not have `output_attentions` final_booleans["output_attentions"] = kwargs.get("output_attentions", None) if final_booleans["output_attentions"] is None: final_booleans["output_attentions"] = config.output_attentions From 06e19cd3d9d9b0533b113cfb4dca59330c0d1a89 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 24 Feb 2022 15:07:57 +0530 Subject: [PATCH 54/65] chore: applied updated quality environment style. --- src/transformers/modeling_tf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 9d392ec6e4ff0..4637130e7771c 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -1908,7 +1908,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range + self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range def build(self, input_shape): """ From 229a817ad8d9eb004fa11f548c483ad26a3a4283 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 24 Feb 2022 15:10:24 +0530 Subject: [PATCH 55/65] chore: applied formatting with quality environment. 
--- tests/test_modeling_common.py | 3066 ++++++++++++++------------------- 1 file changed, 1298 insertions(+), 1768 deletions(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 17888bcfac380..bf707b762c394 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -13,386 +13,196 @@ # See the License for the specific language governing permissions and # limitations under the License. + import copy -import gc import inspect import json import os -import os.path import random -import sys import tempfile import unittest -import warnings -from pathlib import Path -from typing import Dict, List, Tuple - -import numpy as np +from importlib import import_module +from typing import List, Tuple -import transformers -from huggingface_hub import Repository, delete_repo, login +from huggingface_hub import delete_repo, login from requests.exceptions import HTTPError -from transformers import ( - AutoConfig, - AutoModel, - AutoModelForSequenceClassification, - PretrainedConfig, - is_torch_available, - logging, -) -from transformers.file_utils import WEIGHTS_NAME, is_flax_available, is_torch_fx_available +from transformers import is_tf_available from transformers.models.auto import get_values +from transformers.testing_utils import tooslow # noqa: F401 from transformers.testing_utils import ( PASS, USER, CaptureLogger, - TestCasePlus, - is_pt_flax_cross_test, + _tf_gpu_memory_limit, is_pt_tf_cross_test, is_staging_test, - require_torch, - require_torch_multi_gpu, + require_tf, + require_tf2onnx, slow, - torch_device, ) +from transformers.utils import logging -sys.path.append(str(Path(__file__).parent.parent / "utils")) - -from test_module.custom_configuration import CustomConfig, NoSuperInitConfig # noqa E402 +if is_tf_available(): + import numpy as np + import tensorflow as tf - -if is_torch_available(): - import torch - from torch import nn - - from test_module.custom_modeling import CustomModel, NoSuperInitModel from 
transformers import ( - BERT_PRETRAINED_MODEL_ARCHIVE_LIST, - MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, - MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, - MODEL_FOR_MASKED_LM_MAPPING, - MODEL_FOR_MULTIPLE_CHOICE_MAPPING, - MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - MODEL_MAPPING, - AdaptiveEmbedding, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + TF_MODEL_FOR_PRETRAINING_MAPPING, + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, BertConfig, - BertModel, - PreTrainedModel, - T5Config, - T5ForConditionalGeneration, + TFAutoModel, + TFAutoModelForSequenceClassification, + TFBertModel, + TFSharedEmbeddings, + tf_top_k_top_p_filtering, ) - -if is_flax_available(): - import jax.numpy as jnp - from transformers.modeling_flax_pytorch_utils import ( - convert_pytorch_state_dict_to_flax, - load_flax_weights_in_pytorch_model, + from transformers.generation_tf_utils import ( + TFBeamSampleDecoderOnlyOutput, + TFBeamSampleEncoderDecoderOutput, + TFBeamSearchDecoderOnlyOutput, + TFBeamSearchEncoderDecoderOutput, + TFGreedySearchDecoderOnlyOutput, + TFGreedySearchEncoderDecoderOutput, + TFSampleDecoderOnlyOutput, + TFSampleEncoderDecoderOutput, ) -if is_torch_fx_available(): - from transformers.utils.fx import symbolic_trace + if _tf_gpu_memory_limit is not None: + gpus = tf.config.list_physical_devices("GPU") + for gpu in gpus: + # Restrict TensorFlow to only allocate x GB of memory on the GPUs + try: + 
tf.config.set_logical_device_configuration( + gpu, + [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)], + ) + logical_gpus = tf.config.list_logical_devices("GPU") + print("Logical GPUs", logical_gpus) + except RuntimeError as e: + # Virtual devices must be set before GPUs have been initialized + print(e) def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: - setattr(configs_no_init, key, 1e-10) + if "_range" in key or "_std" in key: + setattr(configs_no_init, key, 0.0) return configs_no_init -TINY_T5 = "patrickvonplaten/t5-tiny-random" - - -@require_torch -class ModelTesterMixin: +@require_tf +class TFModelTesterMixin: model_tester = None all_model_classes = () all_generative_model_classes = () - fx_compatible = False - test_torchscript = True - test_pruning = True + test_mismatched_shapes = True test_resize_embeddings = True - test_resize_position_embeddings = False test_head_masking = True - test_mismatched_shapes = True - test_missing_keys = True - test_model_parallel = False is_encoder_decoder = False - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict: inputs_dict = copy.deepcopy(inputs_dict) - if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict = { - k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous() - if isinstance(v, torch.Tensor) and v.ndim > 1 + k: tf.tile( + tf.expand_dims(v, 1), + (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1), + ) + if isinstance(v, tf.Tensor) and v.ndim > 0 else v for k, v in inputs_dict.items() } if return_labels: - if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs_dict["labels"] = 
torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device) - elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): - inputs_dict["start_positions"] = torch.zeros( - self.model_tester.batch_size, dtype=torch.long, device=torch_device - ) - inputs_dict["end_positions"] = torch.zeros( - self.model_tester.batch_size, dtype=torch.long, device=torch_device - ) + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING): + inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) elif model_class in [ - *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), - *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING), - *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), + *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), + *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), ]: - inputs_dict["labels"] = torch.zeros( - self.model_tester.batch_size, dtype=torch.long, device=torch_device - ) + inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING): + inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) elif model_class in [ - *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), - *get_values(MODEL_FOR_CAUSAL_LM_MAPPING), - *get_values(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING), - *get_values(MODEL_FOR_MASKED_LM_MAPPING), - *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), + *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING), + *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING), + *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING), + 
*get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), + *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING), ]: - inputs_dict["labels"] = torch.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device - ) - elif model_class in get_values(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING): - num_patches = self.model_tester.image_size // self.model_tester.patch_size - inputs_dict["bool_masked_pos"] = torch.zeros( - (self.model_tester.batch_size, num_patches**2), dtype=torch.long, device=torch_device + inputs_dict["labels"] = tf.zeros( + ( + self.model_tester.batch_size, + self.model_tester.seq_length, + ), + dtype=tf.int32, ) return inputs_dict + def test_initialization(self): + pass + def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - out_2 = outputs[0].cpu().numpy() - out_2[np.isnan(out_2)] = 0 + outputs = model(self._prepare_for_class(inputs_dict, model_class)) with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) + model.save_pretrained(tmpdirname, saved_model=False) model = model_class.from_pretrained(tmpdirname) - model.to(torch_device) - with torch.no_grad(): - after_outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - # Make sure we don't have nans - out_1 = after_outputs[0].cpu().numpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_save_load_keys_to_ignore_on_save(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - 
_keys_to_ignore_on_save = getattr(model, "_keys_to_ignore_on_save", None) - if _keys_to_ignore_on_save is None: - continue + after_outputs = model(self._prepare_for_class(inputs_dict, model_class)) - # check the keys are in the original state_dict - for k in _keys_to_ignore_on_save: - self.assertIn(k, model.state_dict().keys(), "\n".join(model.state_dict().keys())) - - # check that certain keys didn't get saved with the model - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - output_model_file = os.path.join(tmpdirname, WEIGHTS_NAME) - state_dict_saved = torch.load(output_model_file) - for k in _keys_to_ignore_on_save: - self.assertNotIn(k, state_dict_saved.keys(), "\n".join(state_dict_saved.keys())) - - # Test we can load the state dict in the model, necessary for the checkpointing API in Trainer. - load_result = model.load_state_dict(state_dict_saved, strict=False) - self.assertTrue( - len(load_result.missing_keys) == 0 - or set(load_result.missing_keys) == set(model._keys_to_ignore_on_save) - ) - self.assertTrue(len(load_result.unexpected_keys) == 0) - - def test_gradient_checkpointing_backward_compatibility(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if not model_class.supports_gradient_checkpointing: - continue - - config.gradient_checkpointing = True - model = model_class(config) - self.assertTrue(model.is_gradient_checkpointing) - - def test_gradient_checkpointing_enable_disable(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if not model_class.supports_gradient_checkpointing: - continue - - # at init model should have gradient checkpointing disabled - model = model_class(config) - self.assertFalse(model.is_gradient_checkpointing) - - # check enable works - model.gradient_checkpointing_enable() - 
self.assertTrue(model.is_gradient_checkpointing) - - # check disable works - model.gradient_checkpointing_disable() - self.assertFalse(model.is_gradient_checkpointing) - - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - - def test_save_load_fast_init_from_base(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - base_class = MODEL_MAPPING[config.__class__] - - if isinstance(base_class, tuple): - base_class = base_class[0] - - for model_class in self.all_model_classes: - if model_class == base_class: - continue - - # make a copy of model class to not break future tests - # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class - class CopyClass(model_class): - pass - - model_class_copy = CopyClass - - # make sure that all keys are expected for test - model_class_copy._keys_to_ignore_on_load_missing = [] - - # make init deterministic, but make sure that - # non-initialized weights throw errors nevertheless - model_class_copy._init_weights = self._mock_init_weights - - model = base_class(config) - state_dict = model.state_dict() - - # this will often delete a single weight of a multi-weight module - # to test an edge case - random_key_to_del = random.choice(list(state_dict.keys())) - del state_dict[random_key_to_del] - - # check that certain keys didn't get saved with the model - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin")) + self.assert_outputs_same(after_outputs, outputs) - model_fast_init = model_class_copy.from_pretrained(tmpdirname) - model_slow_init = model_class_copy.from_pretrained(tmpdirname, _fast_init=False) - - for key in model_fast_init.state_dict().keys(): - max_diff = (model_slow_init.state_dict()[key] - 
model_fast_init.state_dict()[key]).sum().item() - self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") - - def test_save_load_fast_init_to_base(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - base_class = MODEL_MAPPING[config.__class__] - - if isinstance(base_class, tuple): - base_class = base_class[0] - - for model_class in self.all_model_classes: - - if model_class == base_class: - continue - - # make a copy of model class to not break future tests - # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class - class CopyClass(base_class): - pass - - base_class_copy = CopyClass - - # make sure that all keys are expected for test - base_class_copy._keys_to_ignore_on_load_missing = [] - - # make init deterministic, but make sure that - # non-initialized weights throw errors nevertheless - base_class_copy._init_weights = self._mock_init_weights - - model = model_class(config) - state_dict = model.state_dict() - - # this will often delete a single weight of a multi-weight module - # to test an edge case - random_key_to_del = random.choice(list(state_dict.keys())) - del state_dict[random_key_to_del] - - # check that certain keys didn't get saved with the model - with tempfile.TemporaryDirectory() as tmpdirname: - model.config.save_pretrained(tmpdirname) - torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin")) - - model_fast_init = base_class_copy.from_pretrained(tmpdirname) - model_slow_init = base_class_copy.from_pretrained(tmpdirname, _fast_init=False) - - for key in model_fast_init.state_dict().keys(): - max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item() - self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = 
model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_determinism(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_save_load_config(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - first = model(**self._prepare_for_class(inputs_dict, model_class))[0] - second = model(**self._prepare_for_class(inputs_dict, model_class))[0] - - out_1 = first.cpu().numpy() - out_2 = second.cpu().numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + model_config = model.get_config() + # make sure that returned config is jsonifiable, which is required by keras + json.dumps(model_config) + new_model = model_class.from_config(model.get_config()) + # make sure it also accepts a normal config + _ = model_class.from_config(model.config) + _ = new_model(self._prepare_for_class(inputs_dict, model_class)) # Build model + new_model.set_weights(model.get_weights()) + after_outputs = new_model(self._prepare_for_class(inputs_dict, model_class)) + + self.assert_outputs_same(after_outputs, outputs) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) - signature = inspect.signature(model.forward) + signature = inspect.signature(model.call) # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = 
[*signature.parameters.keys()] @@ -404,1010 +214,707 @@ def test_forward_signature(self): "decoder_attention_mask", ] expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names + ["head_mask", "decoder_head_mask"] if "head_mask" and "decoder_head_mask" in arg_names else [] + ) + # Necessary to handle BART with newly added cross_attn_head_mask + expected_arg_names.extend( + ["cross_attn_head_mask", "encoder_outputs"] + if "cross_attn_head_mask" in arg_names else ["encoder_outputs"] ) self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: expected_arg_names = ["input_ids"] self.assertListEqual(arg_names[:1], expected_arg_names) - def test_training(self): - if not self.model_tester.is_training: + def test_onnx_compliancy(self): + if not self.test_onnx: return - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + INTERNAL_OPS = [ + "Assert", + "AssignVariableOp", + "EmptyTensorList", + "ReadVariableOp", + "ResourceGather", + "TruncatedNormal", + "VarHandleOp", + "VarIsInitializedOp", + ] + onnx_ops = [] - if model_class in get_values(MODEL_MAPPING): - continue + with open(os.path.join(".", "utils", "tf_ops", "onnx.json")) as f: + onnx_opsets = json.load(f)["opsets"] - model = model_class(config) - model.to(torch_device) - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - loss.backward() - - def test_training_gradient_checkpointing(self): - if not self.model_tester.is_training: - return + for i in range(1, self.onnx_min_opset + 1): + onnx_ops.extend(onnx_opsets[str(i)]) for model_class in self.all_model_classes: - config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True + model_op_names = set() - if model_class in get_values(MODEL_MAPPING) or not model_class.supports_gradient_checkpointing: - continue - model = model_class(config) - model.to(torch_device) - model.gradient_checkpointing_enable() - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - loss.backward() + with tf.Graph().as_default() as g: + model = model_class(config) + model(model.dummy_inputs) - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True + for op in g.get_operations(): + model_op_names.add(op.node_def.op) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - chunk_length = getattr(self.model_tester, "chunk_length", None) - if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): - encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + model_op_names = sorted(model_op_names) + incompatible_ops = [] - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + for op in model_op_names: + if 
op not in onnx_ops and op not in INTERNAL_OPS: + incompatible_ops.append(op) - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertEqual(len(incompatible_ops), 0, incompatible_ops) - if chunk_length is not None: - self.assertListEqual( - list(attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) + @require_tf2onnx + @slow + def test_onnx_runtime_optimize(self): + if not self.test_onnx: + return - if self.is_encoder_decoder: - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - # Question Answering model returns start_logits and end_logits - if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): - correct_outlen += 1 # start_logits and end_logits instead of only 1 output - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) + import onnxruntime + import tf2onnx - # 
cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - encoder_key_length, - ], - ) + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True + for model_class in self.all_model_classes: model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + model(model.dummy_inputs) - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) + onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset) - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + onnxruntime.InferenceSession(onnx_model_proto.SerializeToString()) - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - if chunk_length is not None: - self.assertListEqual( - list(self_attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) + def test_keras_save_load(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + tf_main_layer_classes = set( + module_member + for model_class in self.all_model_classes + for module in (import_module(model_class.__module__),) + for module_member_name in dir(module) + if module_member_name.endswith("MainLayer") 
+ # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`. + and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] + for module_member in (getattr(module, module_member_name),) + if isinstance(module_member, type) + and tf.keras.layers.Layer in module_member.__bases__ + and getattr(module_member, "_keras_serializable", False) + ) + for main_layer_class in tf_main_layer_classes: + # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter + if "T5" in main_layer_class.__name__: + # Take the same values than in TFT5ModelTester for this shared layer + shared = TFSharedEmbeddings(99, 32, name="shared") + config.use_cache = inputs_dict.pop("use_cache", None) + main_layer = main_layer_class(config, embed_tokens=shared) else: - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) + main_layer = main_layer_class(config) - @slow - def test_torchscript(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - self._create_and_check_torchscript(config, inputs_dict) + symbolic_inputs = { + name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + } - @slow - def test_torchscript_output_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_attentions = True - self._create_and_check_torchscript(config, inputs_dict) + model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + outputs = model(inputs_dict) - @slow - def test_torchscript_output_hidden_state(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - self._create_and_check_torchscript(config, inputs_dict) + with tempfile.TemporaryDirectory() as tmpdirname: + filepath = 
os.path.join(tmpdirname, "keras_model.h5") + model.save(filepath) + if "T5" in main_layer_class.__name__: + model = tf.keras.models.load_model( + filepath, + custom_objects={ + main_layer_class.__name__: main_layer_class, + "TFSharedEmbeddings": TFSharedEmbeddings, + }, + ) + else: + model = tf.keras.models.load_model( + filepath, + custom_objects={main_layer_class.__name__: main_layer_class}, + ) + assert isinstance(model, tf.keras.Model) + after_outputs = model(inputs_dict) + self.assert_outputs_same(after_outputs, outputs) + + def assert_outputs_same(self, after_outputs, outputs): + # Make sure we don't have nans + if isinstance(after_outputs, tf.Tensor): + out_1 = after_outputs.numpy() + elif isinstance(after_outputs, dict): + out_1 = after_outputs[list(after_outputs.keys())[0]].numpy() + else: + out_1 = after_outputs[0].numpy() + out_2 = outputs[0].numpy() + self.assertEqual(out_1.shape, out_2.shape) + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) - def _create_and_check_torchscript(self, config, inputs_dict): - if not self.test_torchscript: - return + @is_pt_tf_cross_test + def test_pt_tf_model_equivalence(self): + import torch + + import transformers + + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - configs_no_init.torchscript = True for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.to(torch_device) - model.eval() - inputs = self._prepare_for_class(inputs_dict, model_class) + pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning + pt_model_class = getattr(transformers, pt_model_class_name) - try: - if model.config.is_encoder_decoder: - model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward - 
input_ids = inputs["input_ids"] - attention_mask = inputs["attention_mask"] - decoder_input_ids = inputs["decoder_input_ids"] - decoder_attention_mask = inputs["decoder_attention_mask"] - traced_model = torch.jit.trace( - model, (input_ids, attention_mask, decoder_input_ids, decoder_attention_mask) - ) + config.output_hidden_states = True + + tf_model = model_class(config) + pt_model = pt_model_class(config) + + # Check we can load pt model in tf and vice-versa with model => model functions + tf_model = transformers.load_pytorch_model_in_tf2_model( + tf_model, + pt_model, + tf_inputs=self._prepare_for_class(inputs_dict, model_class), + ) + pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + pt_inputs_dict = {} + for name, key in self._prepare_for_class(inputs_dict, model_class).items(): + if type(key) == bool: + pt_inputs_dict[name] = key + elif name == "input_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + elif name == "pixel_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + elif name == "input_features": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) else: - input_ids = inputs["input_ids"] - traced_model = torch.jit.trace(model, input_ids) - except RuntimeError: - self.fail("Couldn't trace module.") + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) - with tempfile.TemporaryDirectory() as tmp_dir_name: - pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + with torch.no_grad(): + pto = pt_model(**pt_inputs_dict) + tfo = tf_model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + ) - try: - torch.jit.save(traced_model, pt_file_name) - except Exception: - self.fail("Couldn't save module.") + tf_hidden_states = tfo[0].numpy() + pt_hidden_states = 
pto[0].numpy() - try: - loaded_model = torch.jit.load(pt_file_name) - except Exception: - self.fail("Couldn't load module.") + tf_nans = np.copy(np.isnan(tf_hidden_states)) + pt_nans = np.copy(np.isnan(pt_hidden_states)) - model.to(torch_device) - model.eval() + pt_hidden_states[tf_nans] = 0 + tf_hidden_states[tf_nans] = 0 + pt_hidden_states[pt_nans] = 0 + tf_hidden_states[pt_nans] = 0 - loaded_model.to(torch_device) - loaded_model.eval() + max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) + self.assertLessEqual(max_diff, 4e-2) - model_state_dict = model.state_dict() - loaded_model_state_dict = loaded_model.state_dict() + # Check we can load pt model in tf and vice-versa with checkpoint => model functions + with tempfile.TemporaryDirectory() as tmpdirname: + pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") + torch.save(pt_model.state_dict(), pt_checkpoint_path) + tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) - non_persistent_buffers = {} - for key in loaded_model_state_dict.keys(): - if key not in model_state_dict.keys(): - non_persistent_buffers[key] = loaded_model_state_dict[key] + tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") + tf_model.save_weights(tf_checkpoint_path) + pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) - loaded_model_state_dict = { - key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers - } + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + pt_inputs_dict = {} + for name, key in self._prepare_for_class(inputs_dict, model_class).items(): + if type(key) == bool: + key = np.array(key, dtype=bool) + pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long) + elif name == "input_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + elif name == "pixel_values": 
+ pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + elif name == "input_features": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + else: + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) + + with torch.no_grad(): + pto = pt_model(**pt_inputs_dict) + tfo = tf_model(self._prepare_for_class(inputs_dict, model_class)) + tfo = tfo[0].numpy() + pto = pto[0].numpy() + tf_nans = np.copy(np.isnan(tfo)) + pt_nans = np.copy(np.isnan(pto)) - self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + pto[tf_nans] = 0 + tfo[tf_nans] = 0 + pto[pt_nans] = 0 + tfo[pt_nans] = 0 - model_buffers = list(model.buffers()) - for non_persistent_buffer in non_persistent_buffers.values(): - found_buffer = False - for i, model_buffer in enumerate(model_buffers): - if torch.equal(non_persistent_buffer, model_buffer): - found_buffer = True - break + max_diff = np.amax(np.abs(tfo - pto)) + self.assertLessEqual(max_diff, 4e-2) - self.assertTrue(found_buffer) - model_buffers.pop(i) + def test_compile_tf_model(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + max_input = getattr(self.model_tester, "max_position_embeddings", 512) + optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") - models_equal = True - for layer_name, p1 in model_state_dict.items(): - if layer_name in loaded_model_state_dict: - p2 = loaded_model_state_dict[layer_name] - if p1.data.ne(p2.data).sum() > 0: - models_equal = False + for model_class in self.all_model_classes: + if model_class.__name__ in [ + "TFSpeech2TextModel", + "TFSpeech2TextForConditionalGeneration", + ]: + inputs = { + "decoder_input_ids": tf.keras.Input( + batch_shape=(2, max_input), + name="decoder_input_ids", + dtype="int32", + ), + 
"input_features": tf.keras.Input( + batch_shape=( + 2, + max_input, + self.model_tester.input_feat_per_channel * self.model_tester.input_channels, + ), + name="input_features", + dtype="float32", + ), + } + elif self.is_encoder_decoder: + inputs = { + "decoder_input_ids": tf.keras.Input( + batch_shape=(2, max_input), + name="decoder_input_ids", + dtype="int32", + ), + "input_ids": tf.keras.Input( + batch_shape=(2, max_input), + name="input_ids", + dtype="int32", + ), + } + # TODO: A better way to handle vision models + elif model_class.__name__ in [ + "TFViTModel", + "TFViTForImageClassification", + "TFCLIPVisionModel", + ]: + inputs = tf.keras.Input( + batch_shape=( + 3, + self.model_tester.num_channels, + self.model_tester.image_size, + self.model_tester.image_size, + ), + name="pixel_values", + dtype="float32", + ) + elif model_class.__name__ in ["TFCLIPModel"]: + inputs = { + "input_ids": tf.keras.Input( + batch_shape=(3, max_input), + name="input_ids", + dtype="int32", + ), + "pixel_values": tf.keras.Input( + batch_shape=( + 3, + self.model_tester.vision_model_tester.num_channels, + self.model_tester.vision_model_tester.image_size, + self.model_tester.vision_model_tester.image_size, + ), + name="pixel_values", + dtype="float32", + ), + } + elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs = tf.keras.Input( + batch_shape=(4, 2, max_input), + name="input_ids", + dtype="int32", + ) + else: + inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") - self.assertTrue(models_equal) + # Prepare our model + model = model_class(config) + model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving. 
+ # Let's load it from the disk to be sure we can use pretrained weights + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=False) + model = model_class.from_pretrained(tmpdirname) - def test_torch_fx(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - self._create_and_check_torch_fx_tracing(config, inputs_dict) + outputs_dict = model(inputs) + hidden_states = outputs_dict[0] - def test_torch_fx_output_loss(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - self._create_and_check_torch_fx_tracing(config, inputs_dict, output_loss=True) + # Add a dense layer on top to test integration with other keras modules + outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) - def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): - if not is_torch_fx_available() or not self.fx_compatible: - return + # Compile extended model + extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs]) + extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - configs_no_init.return_dict = False + def test_keyword_and_dict_args(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.to(torch_device) - model.eval() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss) + model = model_class(config) + inputs = self._prepare_for_class(inputs_dict, model_class) - try: - if model.config.is_encoder_decoder: - model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward - labels = inputs.get("labels", None) - input_names = ["input_ids", "attention_mask", "decoder_input_ids", 
"decoder_attention_mask"] - if labels is not None: - input_names.append("labels") - filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} - - model_output = model(**filtered_inputs) - - traced_model = symbolic_trace(model, input_names) - traced_output = traced_model(**filtered_inputs) - else: - input_names = ["input_ids", "attention_mask", "token_type_ids"] - input_ids = inputs["input_ids"] - - labels = inputs.get("labels", None) - start_positions = inputs.get("start_positions", None) - end_positions = inputs.get("end_positions", None) - if labels is not None: - input_names.append("labels") - if start_positions is not None: - input_names.append("start_positions") - if end_positions is not None: - input_names.append("end_positions") - - filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} - input_names = filtered_inputs.keys() - - model_output = model(**filtered_inputs) - - rank = len(input_ids.shape) - if rank not in [2, 3]: - raise NotImplementedError( - f"symbolic_trace automatic parameters inference not implemented for input of rank {rank}." 
- ) + outputs_dict = model(inputs) - traced_model = symbolic_trace(model, input_names) - traced_output = traced_model(**filtered_inputs) + inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + outputs_keywords = model(**inputs_keywords) + output_dict = outputs_dict[0].numpy() + output_keywords = outputs_keywords[0].numpy() - except RuntimeError: - self.fail("Couldn't trace module.") + self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) - def flatten_output(output): - flatten = [] - for x in output: - if isinstance(x, (tuple, list)): - flatten += flatten_output(x) - elif not isinstance(x, torch.Tensor): - continue - else: - flatten.append(x) - return flatten + def test_attention_outputs(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + decoder_seq_length = getattr( + self.model_tester, + "decoder_seq_length", + self.model_tester.seq_length, + ) + encoder_seq_length = getattr( + self.model_tester, + "encoder_seq_length", + self.model_tester.seq_length, + ) + decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - model_output = flatten_output(model_output) - traced_output = flatten_output(traced_output) - num_outputs = len(model_output) + def check_decoder_attentions_output(outputs): + out_len = len(outputs) + self.assertEqual(min(out_len % 2, out_len % 5), 0) # differentiation due to newly added cross_attentions + decoder_attentions = outputs.decoder_attentions + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + decoder_key_length, + ], + ) - for i in range(num_outputs): - self.assertTrue( - torch.allclose(model_output[i], traced_output[i]), - f"traced {i}th output 
doesn't match model {i}th output for {model_class}", - ) + def check_encoder_attentions_output(outputs): + attentions = [ + t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions) + ] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + encoder_seq_length, + encoder_key_length, + ], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["use_cache"] = False + config.output_hidden_states = False + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + out_len = len(outputs) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + if self.is_encoder_decoder: + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_decoder_attentions_output(outputs) + + # Check that output attentions can also be changed via the config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) + self.assertEqual(model.config.output_hidden_states, True) + check_encoder_attentions_output(outputs) def test_headmasking(self): if not self.test_head_masking: return - global_rng.seed(42) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() 
- global_rng.seed() + random.Random().seed(42) + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + random.Random().seed() inputs_dict["output_attentions"] = True config.output_hidden_states = True configs_no_init = _config_zero_init(config) # To be sure we have no Nan for model_class in self.all_model_classes: model = model_class(config=configs_no_init) - model.to(torch_device) - model.eval() # Prepare head_mask - # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) - head_mask = torch.ones( - self.model_tester.num_hidden_layers, - self.model_tester.num_attention_heads, - device=torch_device, + def prepare_layer_head_mask(i, attention_heads, num_hidden_layers): + if i == 0: + return tf.concat( + ( + tf.zeros(1, dtype=tf.float32), + tf.ones(attention_heads - 1, dtype=tf.float32), + ), + 0, + ) + elif i == num_hidden_layers - 1: + return tf.concat( + ( + tf.zeros(attention_heads - 1, dtype=tf.float32), + tf.ones(1, dtype=tf.float32), + ), + 0, + ) + else: + return tf.ones(attention_heads, dtype=tf.float32) + + head_mask = tf.stack( + [ + prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers) + for i in range(config.num_hidden_layers) + ], + 0, ) - head_mask[0, 0] = 0 - head_mask[-1, :-1] = 0 - head_mask.requires_grad_(requires_grad=True) + inputs = self._prepare_for_class(inputs_dict, model_class).copy() inputs["head_mask"] = head_mask if model.config.is_encoder_decoder: - signature = inspect.signature(model.forward) + signature = inspect.signature(model.call) arg_names = [*signature.parameters.keys()] if "decoder_head_mask" in arg_names: # necessary diferentiation because of T5 model inputs["decoder_head_mask"] = head_mask if "cross_attn_head_mask" in arg_names: inputs["cross_attn_head_mask"] = head_mask - outputs = model(**inputs, return_dict=True) - - # Test that we can get a gradient back for importance score computation - 
output = sum(t.sum() for t in outputs[0]) - output = output.sum() - output.backward() - multihead_outputs = head_mask.grad - self.assertIsNotNone(multihead_outputs) - self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) + outputs = model(**inputs, return_dict=True) def check_attentions_validity(attentions): # Remove Nan for t in attentions: self.assertLess( - torch.sum(torch.isnan(t)), t.numel() / 4 + (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), + (tf.size(t) / 4).numpy(), ) # Check we don't have more than 25% nans (arbitrary) + attentions = [ - t.masked_fill(torch.isnan(t), 0.0) for t in attentions + tf.where(tf.math.is_nan(t), 0.0, t) for t in attentions ] # remove them (the test is less complete) - self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) - if len(attentions) > 2: # encoder-decoder models have only 2 layers in each module - self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) - self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0) + self.assertNotEqual( + tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), + 0.0, + ) + if len(attentions) > 2: # encoder-decodere models have only 2 layers in each modules + self.assertNotEqual( + tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), + 0.0, + ) + self.assertAlmostEqual( + tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), + 0.0, + ) + self.assertNotEqual( + tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), + 0.0, + ) if model.config.is_encoder_decoder: check_attentions_validity(outputs.encoder_attentions) check_attentions_validity(outputs.decoder_attentions) - 
check_attentions_validity(outputs.cross_attentions) + if "cross_attn_head_mask" in arg_names: + check_attentions_validity(outputs.cross_attentions) else: check_attentions_validity(outputs.attentions) - def test_head_pruning(self): - if not self.test_pruning: - return - - for model_class in self.all_model_classes: - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] - - inputs_dict["output_attentions"] = True - config.output_hidden_states = False - model = model_class(config=config) - model.to(torch_device) - model.eval() - heads_to_prune = { - 0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0], - } - model.prune_heads(heads_to_prune) - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) - - def test_head_pruning_save_load_from_pretrained(self): - if not self.test_pruning: - return - - for model_class in self.all_model_classes: - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] - - inputs_dict["output_attentions"] = True - config.output_hidden_states = False - model = model_class(config=config) - model.to(torch_device) - model.eval() - heads_to_prune = { - 0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0], - } - model.prune_heads(heads_to_prune) - - with tempfile.TemporaryDirectory() as temp_dir_name: - model.save_pretrained(temp_dir_name) - model = model_class.from_pretrained(temp_dir_name) - model.to(torch_device) - - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs[-1] - 
self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) - - def test_head_pruning_save_load_from_config_init(self): - if not self.test_pruning: - return - - for model_class in self.all_model_classes: - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] - - inputs_dict["output_attentions"] = True - config.output_hidden_states = False - - heads_to_prune = { - 0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0], - } - config.pruned_heads = heads_to_prune - - model = model_class(config=config) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) - - def test_head_pruning_integration(self): - if not self.test_pruning: - return - - for model_class in self.all_model_classes: - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] - - inputs_dict["output_attentions"] = True - config.output_hidden_states = False - - heads_to_prune = {0: [0], 1: [1, 2]} - config.pruned_heads = heads_to_prune - - model = model_class(config=config) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - 
self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - - with tempfile.TemporaryDirectory() as temp_dir_name: - model.save_pretrained(temp_dir_name) - model = model_class.from_pretrained(temp_dir_name) - model.to(torch_device) - - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - - heads_to_prune = {0: [0], 2: [1, 2]} - model.prune_heads(heads_to_prune) - - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - - self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + def check_hidden_states_output(config, inputs_dict, model_class): + model = 
model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.encoder_seq_length - if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: - seq_length = seq_length * self.model_tester.chunk_length - else: - seq_length = self.model_tester.seq_length - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], + self.model_tester, + "expected_num_hidden_layers", + self.model_tester.num_hidden_layers + 1, ) - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states + if model.config.is_encoder_decoder: + encoder_hidden_states = outputs.encoder_hidden_states + decoder_hidden_states = outputs.decoder_hidden_states - self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(config.output_attentions, False) + self.assertEqual(len(encoder_hidden_states), expected_num_layers) + self.assertListEqual( + list(encoder_hidden_states[0].shape[-2:]), + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], + ) + self.assertEqual(len(decoder_hidden_states), expected_num_layers) + self.assertListEqual( + list(decoder_hidden_states[0].shape[-2:]), + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], + ) + else: + hidden_states = outputs.hidden_states + self.assertEqual(config.output_attentions, False) self.assertEqual(len(hidden_states), expected_num_layers) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - self.assertListEqual( list(hidden_states[0].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], + [ + 
self.model_tester.seq_length, + self.model_tester.hidden_size, + ], ) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) + check_hidden_states_output(config, inputs_dict, model_class) - # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True + check_hidden_states_output(config, inputs_dict, model_class) - check_hidden_states_output(inputs_dict, config, model_class) - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - model.to(torch_device) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - output = outputs[0] - - if config.is_encoder_decoder: - # Seq2Seq models - encoder_hidden_states = outputs.encoder_hidden_states[0] - encoder_attentions = outputs.encoder_attentions[0] - encoder_hidden_states.retain_grad() - encoder_attentions.retain_grad() - - decoder_hidden_states = outputs.decoder_hidden_states[0] - decoder_attentions = outputs.decoder_attentions[0] - decoder_hidden_states.retain_grad() - decoder_attentions.retain_grad() - - cross_attentions = outputs.cross_attentions[0] - cross_attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(encoder_hidden_states.grad) - self.assertIsNotNone(encoder_attentions.grad) - self.assertIsNotNone(decoder_hidden_states.grad) - self.assertIsNotNone(decoder_attentions.grad) - self.assertIsNotNone(cross_attentions.grad) - else: - # Encoder-/Decoder-only models - hidden_states = 
outputs.hidden_states[0] - attentions = outputs.attentions[0] - - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - def test_feed_forward_chunking(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - torch.manual_seed(0) - config = copy.deepcopy(original_config) - model = model_class(config) - model.to(torch_device) - model.eval() - - hidden_states_no_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] - - torch.manual_seed(0) - config.chunk_size_feed_forward = 1 - model = model_class(config) - model.to(torch_device) - model.eval() - - hidden_states_with_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] - self.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3)) - - def test_resize_position_vector_embeddings(self): - if not self.test_resize_position_embeddings: - return - + def test_model_common_attributes(self): ( - original_config, + config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() + text_in_text_out_models = ( + get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) + + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) + + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING) + ) + speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING) for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) model = model_class(config) - model.to(torch_device) - - if self.model_tester.is_training is False: - model.eval() - - max_position_embeddings = config.max_position_embeddings - - # Retrieve the embeddings and clone theme - if model.config.is_encoder_decoder: - encoder_model_embed, decoder_model_embed = model.get_position_embeddings() - encoder_cloned_embeddings = 
encoder_model_embed.weight.clone() - decoder_cloned_embeddings = decoder_model_embed.weight.clone() - else: - model_embed = model.get_position_embeddings() - cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the position embeddings with a larger max_position_embeddings increases - # the model's postion embeddings size - model.resize_position_embeddings(max_position_embeddings + 10) - self.assertEqual(model.config.max_position_embeddings, max_position_embeddings + 10) - - # Check that it actually resizes the embeddings matrix - if model.config.is_encoder_decoder: - encoder_model_embed, decoder_model_embed = model.get_position_embeddings() - self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] + 10) - self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] + 10) - else: - model_embed = model.get_position_embeddings() - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the position embeddings with a smaller max_position_embeddings decreases - # the model's max_position_embeddings - model.resize_position_embeddings(max_position_embeddings - 5) - self.assertEqual(model.config.max_position_embeddings, max_position_embeddings - 5) - - # Check that it actually resizes the embeddings matrix - if model.config.is_encoder_decoder: - encoder_model_embed, decoder_model_embed = model.get_position_embeddings() - self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] - 5) - self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] - 5) - else: - model_embed = model.get_position_embeddings() - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5) - - # Check that the model can still do a 
forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. - models_equal = True - - if model.config.is_encoder_decoder: - for p1, p2 in zip(encoder_cloned_embeddings, encoder_model_embed.weight): - if p1.data.ne(p2.data).sum() > 0: - models_equal = False - for p1, p2 in zip(decoder_cloned_embeddings, decoder_model_embed.weight): - if p1.data.ne(p2.data).sum() > 0: - models_equal = False + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + if model_class in text_in_text_out_models: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + elif model_class in speech_in_text_out_models: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert name is None else: - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.data.ne(p2.data).sum() > 0: - models_equal = False + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None - self.assertTrue(models_equal) - - def test_resize_tokens_embeddings(self): + def test_determinism(self): ( - original_config, + config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - return for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) model = model_class(config) - model.to(torch_device) - - if self.model_tester.is_training is False: - model.eval() - - model_vocab_size = config.vocab_size - # Retrieve the embeddings and clone theme - model_embed = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the token embeddings with a 
larger vocab size increases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1) - - # make sure that decoder_input_ids are resized as well - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. 
- models_equal = True - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.data.ne(p2.data).sum() > 0: - models_equal = False + first, second = ( + model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + )[0], + model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + )[0], + ) + out_1 = first.numpy() + out_2 = second.numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) - self.assertTrue(models_equal) + def test_model_outputs_equivalence(self): - def test_resize_embeddings_untied(self): ( - original_config, + config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - return - - original_config.tie_word_embeddings = False - - # if model cannot untied embeddings -> leave test - if original_config.tie_word_embeddings: - return - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config).to(torch_device) - - # if no output embeddings -> leave test - if model.get_output_embeddings() is None: - continue - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_vocab_size = config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - 
model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1) - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - def test_model_common_attributes(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding, AdaptiveEmbedding)) - model.set_input_embeddings(nn.Embedding(10, 10)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model_main_input_name(self): - for model_class in self.all_model_classes: - model_signature = inspect.signature(getattr(model_class, "forward")) - # The main input is the name of the argument after `self` - observed_main_input_name = list(model_signature.parameters.keys())[1] - self.assertEqual(model_class.main_input_name, observed_main_input_name) - - def test_correct_missing_keys(self): - if not self.test_missing_keys: - return - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - base_model_prefix = 
model.base_model_prefix - - if hasattr(model, base_model_prefix): - with tempfile.TemporaryDirectory() as temp_dir_name: - model.base_model.save_pretrained(temp_dir_name) - model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True) - with self.subTest(msg=f"Missing keys for {model.__class__.__name__}"): - self.assertGreater(len(loading_info["missing_keys"]), 0) - - def test_tie_model_weights(self): - if not self.test_torchscript: - return - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_same_values(layer_1, layer_2): - equal = True - for p1, p2 in zip(layer_1.weight, layer_2.weight): - if p1.data.ne(p2.data).sum() > 0: - equal = False - return equal - - for model_class in self.all_model_classes: - config.torchscript = True - model_not_tied = model_class(config) - if model_not_tied.get_output_embeddings() is None: - continue - - config_tied = copy.deepcopy(config) - config_tied.torchscript = False - model_tied = model_class(config_tied) - params_tied = list(model_tied.parameters()) - # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(check_same_values(embeddings, decoding)) - - # # Check that after modification, they remain the same. - # embeddings.weight.data.div_(2) - # # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) - # self.assertTrue(check_same_values(embeddings, decoding)) - - # # Check that after modification, they remain the same. - # decoding.weight.data.div_(4) - # # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) - # self.assertTrue(check_same_values(embeddings, decoding)) - - # Check that after resize they remain tied. 
- model_tied.resize_token_embeddings(config.vocab_size + 10) - params_tied_2 = list(model_tied.parameters()) - self.assertEqual(len(params_tied_2), len(params_tied)) - - # decoding.weight.data.mul_(20) - # # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) - # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) - - def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def set_nan_tensor_to_zero(t): - t[t != t] = 0 - return t - - def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - with torch.no_grad(): - tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) - dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values() - ): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - torch.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 - ), - msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. 
Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.", - ) + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + all(tf.equal(tuple_object, dict_object)), + msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}", + ) recursive_check(tuple_output, dict_output) for model_class in self.all_model_classes: model = model_class(config) - model.to(torch_device) - model.eval() tuple_inputs = self._prepare_for_class(inputs_dict, model_class) dict_inputs = self._prepare_for_class(inputs_dict, model_class) @@ -1436,262 +943,22 @@ def recursive_check(tuple_object, dict_object): tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + model, + tuple_inputs, + dict_inputs, + {"output_hidden_states": True, "output_attentions": True}, ) - @is_pt_tf_cross_test - def test_pt_tf_model_equivalence(self): - import numpy as np - import tensorflow as tf - - import transformers - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning - - if not hasattr(transformers, tf_model_class_name): - # transformers does not have TF version 
yet - return - - tf_model_class = getattr(transformers, tf_model_class_name) - - config.output_hidden_states = True - - tf_model = tf_model_class(config) - pt_model = model_class(config) - - # make sure only tf inputs are forward that actually exist in function args - tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys()) - - # remove all head masks - tf_input_keys.discard("head_mask") - tf_input_keys.discard("cross_attn_head_mask") - tf_input_keys.discard("decoder_head_mask") - - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - pt_inputs = {k: v for k, v in pt_inputs.items() if k in tf_input_keys} - - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - tf_inputs_dict = {} - for key, tensor in pt_inputs.items(): - # skip key that does not exist in tf - if type(tensor) == bool: - tf_inputs_dict[key] = tensor - elif key == "input_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - elif key == "pixel_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - elif key == "input_features": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - else: - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) - - # Check we can load pt model in tf and vice-versa with model => model functions - tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict) - pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model).to(torch_device) - - # need to rename encoder-decoder "inputs" for PyTorch - # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: - # pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") - - with torch.no_grad(): - pto = pt_model(**pt_inputs) - tfo = tf_model(tf_inputs_dict, training=False) - - tf_hidden_states = tfo[0].numpy() - 
pt_hidden_states = pto[0].cpu().numpy() - - tf_nans = np.copy(np.isnan(tf_hidden_states)) - pt_nans = np.copy(np.isnan(pt_hidden_states)) - - pt_hidden_states[tf_nans] = 0 - tf_hidden_states[tf_nans] = 0 - pt_hidden_states[pt_nans] = 0 - tf_hidden_states[pt_nans] = 0 - - max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) - self.assertLessEqual(max_diff, 4e-2) - - # Check we can load pt model in tf and vice-versa with checkpoint => model functions - with tempfile.TemporaryDirectory() as tmpdirname: - pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") - torch.save(pt_model.state_dict(), pt_checkpoint_path) - tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) - - tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") - tf_model.save_weights(tf_checkpoint_path) - pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) - pt_model = pt_model.to(torch_device) - - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - tf_inputs_dict = {} - for key, tensor in pt_inputs.items(): - # skip key that does not exist in tf - if type(tensor) == bool: - tensor = np.array(tensor, dtype=bool) - tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32) - elif key == "input_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - elif key == "pixel_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - elif key == "input_features": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - else: - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) - - # need to rename encoder-decoder "inputs" for PyTorch - # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: - # pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") - - with torch.no_grad(): - 
pto = pt_model(**pt_inputs) - - tfo = tf_model(tf_inputs_dict) - tfo = tfo[0].numpy() - pto = pto[0].cpu().numpy() - tf_nans = np.copy(np.isnan(tfo)) - pt_nans = np.copy(np.isnan(pto)) - - pto[tf_nans] = 0 - tfo[tf_nans] = 0 - pto[pt_nans] = 0 - tfo[pt_nans] = 0 - - max_diff = np.amax(np.abs(tfo - pto)) - self.assertLessEqual(max_diff, 4e-2) - - def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): - diff = np.abs((a - b)).max() - self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).") - - @is_pt_flax_cross_test - def test_equivalence_pt_to_flax(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - with self.subTest(model_class.__name__): - - # load PyTorch class - pt_model = model_class(config).eval() - # Flax models don't use the `use_cache` option and cache is not returned as a default. - # So we disable `use_cache` here for PyTorch model. - pt_model.config.use_cache = False - - fx_model_class_name = "Flax" + model_class.__name__ - - if not hasattr(transformers, fx_model_class_name): - return - - fx_model_class = getattr(transformers, fx_model_class_name) - - # load Flax class - fx_model = fx_model_class(config, dtype=jnp.float32) - # make sure only flax inputs are forward that actually exist in function args - fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() - - # prepare inputs - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - - # remove function args that don't exist in Flax - pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} - - fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) - fx_model.params = fx_state - - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs).to_tuple() - - # convert inputs to Flax - fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} - fx_outputs = fx_model(**fx_inputs).to_tuple() 
- self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - for fx_output, pt_output in zip(fx_outputs, pt_outputs): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - - with tempfile.TemporaryDirectory() as tmpdirname: - pt_model.save_pretrained(tmpdirname) - fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True) - - fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple() - self.assertEqual( - len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" - ) - for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs): - self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) - - @is_pt_flax_cross_test - def test_equivalence_flax_to_pt(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - with self.subTest(model_class.__name__): - # load corresponding PyTorch class - pt_model = model_class(config).eval() - - # So we disable `use_cache` here for PyTorch model. 
- pt_model.config.use_cache = False - - fx_model_class_name = "Flax" + model_class.__name__ - - if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return - - fx_model_class = getattr(transformers, fx_model_class_name) - - # load Flax class - fx_model = fx_model_class(config, dtype=jnp.float32) - # make sure only flax inputs are forward that actually exist in function args - fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() - - pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) - - # make sure weights are tied in PyTorch - pt_model.tie_weights() - - # prepare inputs - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - - # remove function args that don't exist in Flax - pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} - - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs).to_tuple() - - fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} - - fx_outputs = fx_model(**fx_inputs).to_tuple() - self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - - for fx_output, pt_output in zip(fx_outputs, pt_outputs): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - - with tempfile.TemporaryDirectory() as tmpdirname: - fx_model.save_pretrained(tmpdirname) - pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True) - - with torch.no_grad(): - pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() - - self.assertEqual( - len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" - ) - for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for 
model_class in self.all_model_classes: model = model_class(config) - model.to(torch_device) - model.eval() - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + inputs = copy.deepcopy(inputs_dict) if not self.is_encoder_decoder: input_ids = inputs["input_ids"] @@ -1702,261 +969,511 @@ def test_inputs_embeds(self): del inputs["input_ids"] inputs.pop("decoder_input_ids", None) - wte = model.get_input_embeddings() if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) + inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids) else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids) + inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids) - with torch.no_grad(): - model(**inputs)[0] + inputs = self._prepare_for_class(inputs, model_class) + + model(inputs) - @require_torch_multi_gpu - def test_multi_gpu_data_parallel_forward(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_numpy_arrays_inputs(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() - # some params shouldn't be scattered by nn.DataParallel - # so just remove them if they are present. 
- blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"] - for k in blacklist_non_batched_params: - inputs_dict.pop(k, None) + def prepare_numpy_arrays(inputs_dict): + inputs_np_dict = {} + for k, v in inputs_dict.items(): + if tf.is_tensor(v): + inputs_np_dict[k] = v.numpy() + else: + inputs_np_dict[k] = np.array(k) - # move input tensors to cuda:O - for k, v in inputs_dict.items(): - if torch.is_tensor(v): - inputs_dict[k] = v.to(0) + return inputs_np_dict for model_class in self.all_model_classes: - model = model_class(config=config) - model.to(0) - model.eval() + model = model_class(config) - # Wrap model in nn.DataParallel - model = nn.DataParallel(model) - with torch.no_grad(): - _ = model(**self._prepare_for_class(inputs_dict, model_class)) + inputs = self._prepare_for_class(inputs_dict, model_class) + inputs_np = prepare_numpy_arrays(inputs) - @require_torch_multi_gpu - def test_model_parallelization(self): - if not self.test_model_parallel: - return + output_for_dict_input = model(inputs_np) + output_for_kw_input = model(**inputs_np) + self.assert_outputs_same(output_for_dict_input, output_for_kw_input) - # a candidate for testing_utils - def get_current_gpu_memory_use(): - """returns a list of cuda memory allocations per GPU in MBs""" + def test_resize_token_embeddings(self): + if not self.test_resize_embeddings: + return + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() - per_device_memory = [] - for id in range(torch.cuda.device_count()): - with torch.cuda.device(id): - per_device_memory.append(torch.cuda.memory_allocated() >> 20) + def _get_word_embedding_weight(model, embedding_layer): + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds - return per_device_memory + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds - # Needs a large model to see the difference. 
- config = self.model_tester.get_large_model_config() + model(model.dummy_inputs) - for model_class in self.all_parallelizable_model_classes: - torch.cuda.empty_cache() + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds - # 1. single gpu memory load + unload + memory measurements - # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests) - memory_at_start = get_current_gpu_memory_use() + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds - # Put model on device 0 and take a memory snapshot - model = model_class(config) - model.to("cuda:0") - memory_after_model_load = get_current_gpu_memory_use() + return None - # The memory use on device 0 should be higher than it was initially. - self.assertGreater(memory_after_model_load[0], memory_at_start[0]) + for model_class in self.all_model_classes: + for size in [config.vocab_size - 10, config.vocab_size + 10, None]: + # build the embeddings + model = model_class(config=config) + old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + old_bias = model.get_bias() + old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + # reshape the embeddings + model.resize_token_embeddings(size) + new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + new_bias = model.get_bias() + new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + + # check that the resized embeddings size matches the desired size. 
+ assert_size = size if size is not None else config.vocab_size + self.assertEqual(new_input_embeddings.shape[0], assert_size) + + # check that weights remain the same after resizing + models_equal = True + for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_bias is not None and new_bias is not None: + for old_weight, new_weight in zip(old_bias.values(), new_bias.values()): + self.assertEqual(new_weight.shape[0], assert_size) + + models_equal = True + for p1, p2 in zip(old_weight.value(), new_weight.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_output_embeddings is not None and new_output_embeddings is not None: + self.assertEqual(new_output_embeddings.shape[0], assert_size) + self.assertEqual( + new_output_embeddings.shape[1], + old_output_embeddings.shape[1], + ) - del model - gc.collect() - torch.cuda.empty_cache() + models_equal = True + for p1, p2 in zip( + old_output_embeddings.value(), + new_output_embeddings.value(), + ): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) - # 2. 
MP test - # it's essential to re-calibrate the usage before the next stage - memory_at_start = get_current_gpu_memory_use() + def test_lm_head_model_random_no_beam_search_generate(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict.get("input_ids", None) - # Spread model layers over multiple devices + # iterate over all generative models + for model_class in self.all_generative_model_classes: model = model_class(config) - model.parallelize() - memory_after_parallelization = get_current_gpu_memory_use() - # Assert that the memory use on all devices is higher than it was when loaded only on CPU - for n in range(torch.cuda.device_count()): - self.assertGreater(memory_after_parallelization[n], memory_at_start[n]) - - # Assert that the memory use of device 0 is lower than it was when the entire model was loaded on it - self.assertLess(memory_after_parallelization[0], memory_after_model_load[0]) - - # Assert that the memory use of device 1 is higher than it was when the entire model was loaded - # on device 0 and device 1 wasn't used at all - self.assertGreater(memory_after_parallelization[1], memory_after_model_load[1]) - - del model - gc.collect() - torch.cuda.empty_cache() - - @require_torch_multi_gpu - def test_model_parallel_equal_results(self): - if not self.test_model_parallel: - return + if config.bos_token_id is None: + # if bos token id is not defined model needs input_ids + with self.assertRaises(AssertionError): + model.generate(do_sample=True, max_length=5) + # num_return_sequences = 1 + self._check_generated_ids(model.generate(input_ids, do_sample=True)) + elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]: + # Models with non-text inputs won't work here; num_return_sequences = 1 + self._check_generated_ids(model.generate(do_sample=True, max_length=5)) + + with self.assertRaises(ValueError): + # generating multiple sequences when no beam search generation + # is 
not allowed as it would always generate the same sequences + model.generate(input_ids, do_sample=False, num_return_sequences=2) + + # num_return_sequences > 1, sample + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2)) + + # check bad words tokens language generation + # create list of 1-seq bad token and list of 2-seq of bad tokens + bad_words_ids = [ + self._generate_random_bad_tokens(1, model), + self._generate_random_bad_tokens(2, model), + ] + output_tokens = model.generate( + input_ids, + do_sample=True, + bad_words_ids=bad_words_ids, + num_return_sequences=2, + ) + # only count generated tokens + generated_ids = output_tokens[:, input_ids.shape[-1] :] + self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_lm_head_model_no_beam_search_generate_dict_outputs(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict.get("input_ids", None) + if input_ids is None: + input_ids = inputs_dict.get("input_features", None) - for model_class in self.all_parallelizable_model_classes: - inputs_dict = self._prepare_for_class(inputs_dict, model_class) + # iterate over all generative models + for model_class in self.all_generative_model_classes: + model = model_class(config) + output_greedy = model.generate( + input_ids, + do_sample=False, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + output_sample = model.generate( + input_ids, + do_sample=True, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) - def cast_to_device(dictionary, device): - output = {} - for k, v in dictionary.items(): - if isinstance(v, torch.Tensor): - output[k] = v.to(device) - else: - output[k] = v + if model.config.is_encoder_decoder: + 
self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput) + self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput) + else: + self.assertIsInstance(output_greedy, TFGreedySearchDecoderOnlyOutput) + self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput) - return output + def test_lm_head_model_random_beam_search_generate(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict.get("input_ids", None) + for model_class in self.all_generative_model_classes: model = model_class(config) - output = model(**cast_to_device(inputs_dict, "cpu")) - - model.parallelize() - - parallel_output = model(**cast_to_device(inputs_dict, "cuda:0")) - for value, parallel_value in zip(output, parallel_output): - if isinstance(value, torch.Tensor): - self.assertTrue(torch.allclose(value, parallel_value.to("cpu"), atol=1e-7)) - elif isinstance(value, (Tuple, List)): - for value_, parallel_value_ in zip(value, parallel_value): - self.assertTrue(torch.allclose(value_, parallel_value_.to("cpu"), atol=1e-7)) + if config.bos_token_id is None: + # if bos token id is not defined model needs input_ids, num_return_sequences = 1 + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2)) + else: + # num_return_sequences = 1 + self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2)) + + with self.assertRaises(AssertionError): + # generating more sequences than having beams leads is not possible + model.generate( + input_ids, + do_sample=False, + num_return_sequences=3, + num_beams=2, + ) - @require_torch_multi_gpu - def test_model_parallel_beam_search(self): - if not self.test_model_parallel: - return + # num_return_sequences > 1, sample + self._check_generated_ids( + model.generate( + input_ids, + do_sample=True, + num_beams=2, + num_return_sequences=2, + ) + ) + # num_return_sequences > 1, greedy + self._check_generated_ids( + model.generate( + 
input_ids, + do_sample=False, + num_beams=2, + num_return_sequences=2, + ) + ) - all_generative_and_parallelizable_model_classes = tuple( - set(self.all_generative_model_classes).intersection(self.all_parallelizable_model_classes) - ) + # check bad words tokens language generation + # create list of 1-seq bad token and list of 2-seq of bad tokens + bad_words_ids = [ + self._generate_random_bad_tokens(1, model), + self._generate_random_bad_tokens(2, model), + ] + output_tokens = model.generate( + input_ids, + do_sample=False, + bad_words_ids=bad_words_ids, + num_beams=2, + num_return_sequences=2, + ) + # only count generated tokens + generated_ids = output_tokens[:, input_ids.shape[-1] :] + self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_lm_head_model_beam_search_generate_dict_outputs(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict.get("input_ids", None) + if input_ids is None: + input_ids = inputs_dict.get("input_features", None) - for model_class in all_generative_and_parallelizable_model_classes: - inputs_dict = self._prepare_for_class(inputs_dict, model_class) + # iterate over all generative models + for model_class in self.all_generative_model_classes: model = model_class(config) + output_beam_search = model.generate( + input_ids, + num_beams=2, + do_sample=False, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + output_beam_sample = model.generate( + input_ids, + num_beams=2, + do_sample=True, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) - def cast_to_device(dictionary, device): - output = {} - for k, v in dictionary.items(): - if isinstance(v, torch.Tensor): - output[k] = v.to(device) - else: - output[k] = v - - return 
output - - model.parallelize() - model.generate(**cast_to_device(inputs_dict, "cuda:0"), num_beams=2) - - def test_problem_types(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - problem_types = [ - {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float}, - {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long}, - {"title": "regression", "num_labels": 1, "dtype": torch.float}, - ] + if model.config.is_encoder_decoder: + self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput) + self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput) + else: + self.assertIsInstance(output_beam_search, TFBeamSearchDecoderOnlyOutput) + self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput) + def test_loss_computation(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): - continue - - for problem_type in problem_types: - with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): + model = model_class(config) + if getattr(model, "hf_compute_loss", None): + # The number of elements in the loss should be the same as the number of elements in the label + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + added_label = prepared_for_class[ + sorted( + list(prepared_for_class.keys() - inputs_dict.keys()), + reverse=True, + )[0] + ] + loss_size = tf.size(added_label) - config.problem_type = problem_type["title"] - config.num_labels = problem_type["num_labels"] + if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING): + # if loss is causal lm loss, labels are shift, so that one label per batch + # is cut + loss_size = loss_size - self.model_tester.batch_size - model = model_class(config) - 
model.to(torch_device) - model.train() + # Test that model correctly compute the loss with kwargs + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + possible_input_names = { + "input_ids", + "pixel_values", + "input_features", + } + input_name = possible_input_names.intersection(set(prepared_for_class)).pop() + model_input = prepared_for_class.pop(input_name) + + loss = model(model_input, **prepared_for_class)[0] + self.assertEqual(loss.shape, [loss_size]) + + # Test that model correctly compute the loss with a dict + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + loss = model(prepared_for_class)[0] + self.assertEqual(loss.shape, [loss_size]) + + # Test that model correctly compute the loss with a tuple + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + + # Get keys that were added with the _prepare_for_class function + label_keys = prepared_for_class.keys() - inputs_dict.keys() + signature = inspect.signature(model.call).parameters + signature_names = list(signature.keys()) + + # Create a dictionary holding the location of the tensors in the tuple + tuple_index_mapping = {0: input_name} + for label_key in label_keys: + label_key_index = signature_names.index(label_key) + tuple_index_mapping[label_key_index] = label_key + sorted_tuple_index_mapping = sorted(tuple_index_mapping.items()) + # Initialize a list with their default values, update the values and convert to a tuple + list_input = [] + + for name in signature_names: + if name != "kwargs": + list_input.append(signature[name].default) + + for index, value in sorted_tuple_index_mapping: + list_input[index] = prepared_for_class[value] + + tuple_input = tuple(list_input) + + # Send to model + loss = model(tuple_input[:-1])[0] + + self.assertEqual(loss.shape, [loss_size]) + + def test_generate_with_headmasking(self): + attention_names = [ + 
"encoder_attentions", + "decoder_attentions", + "cross_attentions", + ] + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + for model_class in self.all_generative_model_classes: + model = model_class(config) - if problem_type["num_labels"] > 1: - inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"]) + # We want to test only encoder-decoder models + if not config.is_encoder_decoder: + continue - inputs["labels"] = inputs["labels"].to(problem_type["dtype"]) + head_masking = { + "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)), + "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), + "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), + } - # This tests that we do not trigger the warning form PyTorch "Using a target size that is different - # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure - # they have the same size." which is a symptom something in wrong for the regression problem. 
- # See https://github.com/huggingface/transformers/issues/11780 - with warnings.catch_warnings(record=True) as warning_list: - loss = model(**inputs).loss - for w in warning_list: - if "Using a target size that is different to the input size" in str(w.message): - raise ValueError( - f"Something is going wrong in the regression problem: intercepted {w.message}" - ) + signature = inspect.signature(model.call) + if set(head_masking.keys()) < set([*signature.parameters.keys()]): + continue - loss.backward() + for attn_name, (name, mask) in zip(attention_names, head_masking.items()): + out = model.generate( + inputs_dict["input_ids"], + num_beams=1, + max_length=inputs_dict["input_ids"] + 5, + output_attentions=True, + return_dict_in_generate=True, + **{name: mask}, + ) + # We check the state of decoder_attentions and cross_attentions just from the last step + attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] + self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0) def test_load_with_mismatched_shapes(self): if not self.test_mismatched_shapes: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): + if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): continue with self.subTest(msg=f"Testing {model_class}"): with tempfile.TemporaryDirectory() as tmp_dir: model = model_class(config) + inputs = self._prepare_for_class(inputs_dict, model_class) + _ = model(**inputs) model.save_pretrained(tmp_dir) # Fails when we don't set ignore_mismatched_sizes=True - with self.assertRaises(RuntimeError): - new_model = AutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42) - with self.assertRaises(RuntimeError): - new_model_without_prefix 
= AutoModel.from_pretrained(tmp_dir, vocab_size=10) - - logger = logging.get_logger("transformers.modeling_utils") + with self.assertRaises(ValueError): + new_model = TFAutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42) + with self.assertRaises(ValueError): + new_model_without_prefix = TFAutoModel.from_pretrained(tmp_dir, vocab_size=10) + logger = logging.get_logger("transformers.modeling_tf_utils") with CaptureLogger(logger) as cl: - new_model = AutoModelForSequenceClassification.from_pretrained( + new_model = TFAutoModelForSequenceClassification.from_pretrained( tmp_dir, num_labels=42, ignore_mismatched_sizes=True ) self.assertIn("the shapes did not match", cl.out) - new_model.to(torch_device) - inputs = self._prepare_for_class(inputs_dict, model_class) + logits = new_model(**inputs).logits self.assertEqual(logits.shape[1], 42) with CaptureLogger(logger) as cl: - new_model_without_prefix = AutoModel.from_pretrained( + new_model_without_prefix = TFAutoModel.from_pretrained( tmp_dir, vocab_size=10, ignore_mismatched_sizes=True ) self.assertIn("the shapes did not match", cl.out) + + # Although Tf models always have a prefix pointing to `MainLayer`, + # we still add this "without prefix" test to keep a consistency between tf and pt tests. 
input_ids = ids_tensor((2, 8), 10) - new_model_without_prefix.to(torch_device) if self.is_encoder_decoder: new_model_without_prefix(input_ids, decoder_input_ids=input_ids) else: new_model_without_prefix(input_ids) + def test_model_main_input_name(self): + for model_class in self.all_model_classes: + model_signature = inspect.signature(getattr(model_class, "call")) + # The main input is the name of the argument after `self` + observed_main_input_name = list(model_signature.parameters.keys())[1] + self.assertEqual(model_class.main_input_name, observed_main_input_name) -global_rng = random.Random() - - -def ids_tensor(shape, vocab_size, rng=None, name=None): - # Creates a random int32 tensor of the shape within the vocab size + def _generate_random_bad_tokens(self, num_bad_tokens, model): + # special tokens cannot be bad tokens + special_tokens = [] + if model.config.bos_token_id is not None: + special_tokens.append(model.config.bos_token_id) + if model.config.pad_token_id is not None: + special_tokens.append(model.config.pad_token_id) + if model.config.eos_token_id is not None: + special_tokens.append(model.config.eos_token_id) + + # create random bad tokens that are not special tokens + bad_tokens = [] + while len(bad_tokens) < num_bad_tokens: + token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0] + if token not in special_tokens: + bad_tokens.append(token) + return bad_tokens + + def _check_generated_ids(self, output_ids): + for token_id in output_ids[0].numpy().tolist(): + self.assertGreaterEqual(token_id, 0) + self.assertLess(token_id, self.model_tester.vocab_size) + + def _check_match_tokens(self, generated_ids, bad_words_ids): + # for all bad word tokens + for bad_word_ids in bad_words_ids: + # for all slices in batch + for generated_ids_slice in generated_ids: + # for all word idx + for i in range(len(bad_word_ids), len(generated_ids_slice)): + # if tokens match + if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids: 
+ return True + return False + + +def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): + """Creates a random int32 tensor of the shape within the vocab size.""" if rng is None: - rng = global_rng + rng = random.Random() total_dims = 1 for dim in shape: @@ -1966,20 +1483,28 @@ def ids_tensor(shape, vocab_size, rng=None, name=None): for _ in range(total_dims): values.append(rng.randint(0, vocab_size - 1)) - return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous() + output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32) + + return output -def random_attention_mask(shape, rng=None, name=None): - attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None) +def random_attention_mask(shape, rng=None, name=None, dtype=None): + attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype) # make sure that at least one token is attended to for each batch - attn_mask[:, -1] = 1 + attn_mask = tf.concat( + [ + tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), + attn_mask[:, 1:], + ], + axis=1, + ) return attn_mask -def floats_tensor(shape, scale=1.0, rng=None, name=None): +def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): """Creates a random float32 tensor""" if rng is None: - rng = global_rng + rng = random.Random() total_dims = 1 for dim in shape: @@ -1989,128 +1514,134 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None): for _ in range(total_dims): values.append(rng.random() * scale) - return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous() - - -@require_torch -class ModelUtilsTest(TestCasePlus): - @slow - def test_model_from_pretrained(self): - for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - config = BertConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, PretrainedConfig) - - model = BertModel.from_pretrained(model_name) - model, 
loading_info = BertModel.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, PreTrainedModel) - - self.assertEqual(len(loading_info["missing_keys"]), 0) - self.assertEqual(len(loading_info["unexpected_keys"]), 8) - self.assertEqual(len(loading_info["mismatched_keys"]), 0) - self.assertEqual(len(loading_info["error_msgs"]), 0) - - config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) + return tf.reshape( + tf.constant(values, dtype=dtype if dtype is not None else tf.float32), + shape=shape, + ) - # Not sure this is the intended behavior. TODO fix Lysandre & Thom - config.name_or_path = model_name - model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) - self.assertEqual(model.config.output_hidden_states, True) - self.assertEqual(model.config, config) - - def test_model_from_pretrained_with_different_pretrained_model_name(self): - model = T5ForConditionalGeneration.from_pretrained(TINY_T5) - self.assertIsNotNone(model) - - logger = logging.get_logger("transformers.configuration_utils") - with CaptureLogger(logger) as cl: - BertModel.from_pretrained(TINY_T5) - self.assertTrue("You are using a model of type t5 to instantiate a model of type bert" in cl.out) - - @require_torch - def test_model_from_config_torch_dtype(self): - # test that the model can be instantiated with dtype of user's choice - as long as it's a - # float dtype. To make it happen config.torch_dtype needs to be set before instantiating the - # model from the config object. 
- - config = T5Config.from_pretrained(TINY_T5) - model = AutoModel.from_config(config) - # XXX: isn't supported - # model = T5ForConditionalGeneration.from_config(config) - self.assertEqual(model.dtype, torch.float32) - - model = AutoModel.from_config(config, torch_dtype=torch.float16) - self.assertEqual(model.dtype, torch.float16) - - # torch.set_default_dtype() supports only float dtypes, so will fail with non-float type - with self.assertRaises(ValueError): - model = AutoModel.from_config(config, torch_dtype=torch.int64) - - @require_torch - def test_model_from_pretrained_torch_dtype(self): - # test that the model can be instantiated with dtype of either - # 1. explicit from_pretrained's torch_dtype argument - # 2. via autodiscovery by looking at model weights (torch_dtype="auto") - # so if a model.half() was saved, we want it to be instantiated as such. - # - # test an explicit model class, but also AutoModel separately as the latter goes through a different code path - model_path = self.get_auto_remove_tmp_dir() - - # baseline - we know TINY_T5 is fp32 model - model = T5ForConditionalGeneration.from_pretrained(TINY_T5) - self.assertEqual(model.dtype, torch.float32) - - # test the default fp32 save_pretrained => from_pretrained cycle - model.save_pretrained(model_path) - model = T5ForConditionalGeneration.from_pretrained(model_path) - self.assertEqual(model.dtype, torch.float32) - # test with auto-detection - model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto") - self.assertEqual(model.dtype, torch.float32) - - # test forced loading in fp16 (even though the weights are in fp32) - model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16) - self.assertEqual(model.dtype, torch.float16) - - # test fp16 save_pretrained, loaded with auto-detection - model = model.half() - model.save_pretrained(model_path) - model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto") - 
self.assertEqual(model.config.torch_dtype, torch.float16) - self.assertEqual(model.dtype, torch.float16) - - # tests `config.torch_dtype` saving - with open(f"{model_path}/config.json") as f: - config_dict = json.load(f) - self.assertEqual(config_dict["torch_dtype"], "float16") - - # test fp16 save_pretrained, loaded with the explicit fp16 - model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16) - self.assertEqual(model.dtype, torch.float16) - - # test AutoModel separately as it goes through a different path - # test auto-detection - model = AutoModel.from_pretrained(TINY_T5, torch_dtype="auto") - self.assertEqual(model.dtype, torch.float32) - # test forcing an explicit dtype - model = AutoModel.from_pretrained(TINY_T5, torch_dtype=torch.float16) - self.assertEqual(model.dtype, torch.float16) - - def test_no_super_init_config_and_model(self): - config = NoSuperInitConfig(attribute=32) - model = NoSuperInitModel(config) +@require_tf +class UtilsFunctionsTest(unittest.TestCase): + + # tests whether the top_k_top_p_filtering function behaves as expected + def test_top_k_top_p_filtering(self): + logits = tf.convert_to_tensor( + [ + [ + 8.2220991, # 3rd highest value; idx. 0 + -0.5620044, + 5.23229752, + 4.0386393, + -6.8798378, + -0.54785802, + -3.2012153, + 2.92777176, + 1.88171953, + 7.35341276, # 5th highest value; idx. 9 + 8.43207833, # 2nd highest value; idx. 10 + -9.85711836, + -5.96209236, + -1.13039161, + -7.1115294, + -0.8369633, + -5.3186408, + 7.06427407, + 0.81369344, + -0.82023817, + -5.9179796, + 0.58813443, + -6.99778438, + 4.71551189, + -0.18771637, + 7.44020759, # 4th highest value; idx. 25 + 9.38450987, # 1st highest value; idx. 
26 + 2.12662941, + -9.32562038, + 2.35652522, + ], # cummulative prob of 5 highest values <= 0.6 + [ + 0.58425518, + 4.53139238, + -5.57510464, + -6.28030699, + -7.19529503, + -4.02122551, + 1.39337037, + -6.06707057, + 1.59480517, + -9.643119, + 0.03907799, + 0.67231762, + -8.88206726, + 6.27115922, # 4th highest value; idx. 13 + 2.28520723, + 4.82767506, + 4.30421368, + 8.8275313, # 2nd highest value; idx. 17 + 5.44029958, # 5th highest value; idx. 18 + -4.4735794, + 7.38579536, # 3rd highest value; idx. 20 + -2.91051663, + 2.61946077, + -2.5674762, + -9.48959302, + -4.02922645, + -1.35416918, + 9.67702323, # 1st highest value; idx. 27 + -5.89478553, + 1.85370467, + ], # cummulative prob of 5 highest values <= 0.6 + ], + dtype=tf.float32, + ) - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(tmp_dir) + non_inf_expected_idx = tf.convert_to_tensor( + [ + [0, 0], + [0, 9], + [0, 10], + [0, 25], + [0, 26], + [1, 13], + [1, 17], + [1, 18], + [1, 20], + [1, 27], + ], + dtype=tf.int32, + ) # expected non filtered idx as noted above + + non_inf_expected_output = tf.convert_to_tensor( + [ + 8.222099, + 7.3534126, + 8.432078, + 7.4402075, + 9.38451, + 6.271159, + 8.827531, + 5.4402995, + 7.3857956, + 9.677023, + ], + dtype=tf.float32, + ) # expected non filtered values as noted above + + output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4) + + non_inf_output = output[output != -float("inf")] + non_inf_idx = tf.cast( + tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))), + dtype=tf.int32, + ) - model = NoSuperInitModel.from_pretrained(tmp_dir) + tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12) + tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx) -@require_torch +@require_tf @is_staging_test -class ModelPushToHubTester(unittest.TestCase): +class TFModelPushToHubTester(unittest.TestCase): @classmethod def setUpClass(cls): cls._token = login(username=USER, 
password=PASS) @@ -2118,78 +1649,77 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): try: - delete_repo(token=cls._token, name="test-model") + delete_repo(token=cls._token, name="test-model-tf") except HTTPError: pass try: - delete_repo(token=cls._token, name="test-model-org", organization="valid_org") - except HTTPError: - pass - - try: - delete_repo(token=cls._token, name="test-dynamic-model") - except HTTPError: - pass - - try: - delete_repo(token=cls._token, name="test-dynamic-model-config") + delete_repo( + token=cls._token, + name="test-model-tf-org", + organization="valid_org", + ) except HTTPError: pass def test_push_to_hub(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) - model = BertModel(config) + model = TFBertModel(config) + # Make sure model is properly initialized + _ = model(model.dummy_inputs) with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(os.path.join(tmp_dir, "test-model"), push_to_hub=True, use_auth_token=self._token) + model.save_pretrained( + os.path.join(tmp_dir, "test-model-tf"), + push_to_hub=True, + use_auth_token=self._token, + ) + + new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf") + models_equal = True + for p1, p2 in zip(model.weights, new_model.weights): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) - new_model = BertModel.from_pretrained(f"{USER}/test-model") - for p1, p2 in zip(model.parameters(), new_model.parameters()): - self.assertTrue(torch.equal(p1, p2)) + def test_push_to_hub_with_model_card(self): + config = BertConfig( + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + ) + model = TFBertModel(config) + with tempfile.TemporaryDirectory() as tmp_dir: + 
model.push_to_hub(os.path.join(tmp_dir, "test-model-tf")) + self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "test-model-card-tf", "README.md"))) def test_push_to_hub_in_organization(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) - model = BertModel(config) + model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: model.save_pretrained( - os.path.join(tmp_dir, "test-model-org"), + os.path.join(tmp_dir, "test-model-tf-org"), push_to_hub=True, use_auth_token=self._token, organization="valid_org", ) - new_model = BertModel.from_pretrained("valid_org/test-model-org") - for p1, p2 in zip(model.parameters(), new_model.parameters()): - self.assertTrue(torch.equal(p1, p2)) - - def test_push_to_hub_dynamic_model(self): - CustomConfig.register_for_auto_class() - CustomModel.register_for_auto_class() - - config = CustomConfig(hidden_size=32) - model = CustomModel(config) - - with tempfile.TemporaryDirectory() as tmp_dir: - repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-model", use_auth_token=self._token) - model.save_pretrained(tmp_dir) - # checks - self.assertDictEqual( - config.auto_map, - {"AutoConfig": "custom_configuration.CustomConfig", "AutoModel": "custom_modeling.CustomModel"}, - ) - - repo.push_to_hub() - - new_model = AutoModel.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True) - # Can't make an isinstance check because the new_model is from the CustomModel class of a dynamic module - self.assertEqual(new_model.__class__.__name__, "CustomModel") - for p1, p2 in zip(model.parameters(), new_model.parameters()): - self.assertTrue(torch.equal(p1, p2)) - - config = AutoConfig.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True) - new_model = AutoModel.from_config(config, trust_remote_code=True) - 
self.assertEqual(new_model.__class__.__name__, "CustomModel") + new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org") + models_equal = True + for p1, p2 in zip(model.weights, new_model.weights): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) From ad5d7e0153ab9895205a238f0ee03f9c2259833d Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 24 Feb 2022 15:43:01 +0530 Subject: [PATCH 56/65] chore: revert to the previous tests/test_modeling_common.py. --- tests/test_modeling_common.py | 3172 ++++++++++++++++++++------------- 1 file changed, 1961 insertions(+), 1211 deletions(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index bf707b762c394..348ffcd2c4490 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -13,152 +13,183 @@ # See the License for the specific language governing permissions and # limitations under the License. - import copy +import gc import inspect import json import os +import os.path import random +import sys import tempfile import unittest -from importlib import import_module -from typing import List, Tuple +import warnings +from pathlib import Path +from typing import Dict, List, Tuple + +import numpy as np -from huggingface_hub import delete_repo, login +import transformers +from huggingface_hub import Repository, delete_repo, login from requests.exceptions import HTTPError -from transformers import is_tf_available +from transformers import ( + AutoConfig, + AutoModel, + AutoModelForSequenceClassification, + PretrainedConfig, + is_torch_available, + logging, +) +from transformers.file_utils import WEIGHTS_NAME, is_flax_available, is_torch_fx_available from transformers.models.auto import get_values -from transformers.testing_utils import tooslow # noqa: F401 from transformers.testing_utils import ( PASS, USER, CaptureLogger, - _tf_gpu_memory_limit, + TestCasePlus, + is_pt_flax_cross_test, is_pt_tf_cross_test, 
is_staging_test, - require_tf, - require_tf2onnx, + require_torch, + require_torch_multi_gpu, slow, + torch_device, ) -from transformers.utils import logging -if is_tf_available(): - import numpy as np - import tensorflow as tf +sys.path.append(str(Path(__file__).parent.parent / "utils")) + +from test_module.custom_configuration import CustomConfig, NoSuperInitConfig # noqa E402 + +if is_torch_available(): + import torch + from torch import nn + + from test_module.custom_modeling import CustomModel, NoSuperInitModel from transformers import ( - TF_MODEL_FOR_CAUSAL_LM_MAPPING, - TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, - TF_MODEL_FOR_MASKED_LM_MAPPING, - TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING, - TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, - TF_MODEL_FOR_PRETRAINING_MAPPING, - TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, - TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, - TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, + MODEL_FOR_MASKED_LM_MAPPING, + MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_MAPPING, + AdaptiveEmbedding, BertConfig, - TFAutoModel, - TFAutoModelForSequenceClassification, - TFBertModel, - TFSharedEmbeddings, - tf_top_k_top_p_filtering, + BertModel, + PreTrainedModel, + T5Config, + T5ForConditionalGeneration, ) - from transformers.generation_tf_utils import ( - TFBeamSampleDecoderOnlyOutput, - TFBeamSampleEncoderDecoderOutput, - TFBeamSearchDecoderOnlyOutput, - TFBeamSearchEncoderDecoderOutput, - TFGreedySearchDecoderOnlyOutput, - TFGreedySearchEncoderDecoderOutput, - 
TFSampleDecoderOnlyOutput, - TFSampleEncoderDecoderOutput, + +if is_flax_available(): + import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( + convert_pytorch_state_dict_to_flax, + load_flax_weights_in_pytorch_model, ) - if _tf_gpu_memory_limit is not None: - gpus = tf.config.list_physical_devices("GPU") - for gpu in gpus: - # Restrict TensorFlow to only allocate x GB of memory on the GPUs - try: - tf.config.set_logical_device_configuration( - gpu, - [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)], - ) - logical_gpus = tf.config.list_logical_devices("GPU") - print("Logical GPUs", logical_gpus) - except RuntimeError as e: - # Virtual devices must be set before GPUs have been initialized - print(e) +if is_torch_fx_available(): + from transformers.utils.fx import symbolic_trace def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key: - setattr(configs_no_init, key, 0.0) + if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: + setattr(configs_no_init, key, 1e-10) return configs_no_init -@require_tf -class TFModelTesterMixin: +TINY_T5 = "patrickvonplaten/t5-tiny-random" + + +@require_torch +class ModelTesterMixin: model_tester = None all_model_classes = () all_generative_model_classes = () - test_mismatched_shapes = True + fx_compatible = False + test_torchscript = True + test_pruning = True test_resize_embeddings = True + test_resize_position_embeddings = False test_head_masking = True + test_mismatched_shapes = True + test_missing_keys = True + test_model_parallel = False is_encoder_decoder = False - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict: + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) - - if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + if 
model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict = { - k: tf.tile( - tf.expand_dims(v, 1), - (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1), - ) - if isinstance(v, tf.Tensor) and v.ndim > 0 + k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous() + if isinstance(v, torch.Tensor) and v.ndim > 1 else v for k, v in inputs_dict.items() } if return_labels: - if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32) - elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING): - inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) - inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict["labels"] = torch.ones( + self.model_tester.batch_size, + dtype=torch.long, + device=torch_device, + ) + elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): + inputs_dict["start_positions"] = torch.zeros( + self.model_tester.batch_size, + dtype=torch.long, + device=torch_device, + ) + inputs_dict["end_positions"] = torch.zeros( + self.model_tester.batch_size, + dtype=torch.long, + device=torch_device, + ) elif model_class in [ - *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), - *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING), + *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), ]: - inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) - elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING): - inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, + dtype=torch.long, + 
device=torch_device, + ) elif model_class in [ - *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), - *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING), - *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING), - *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING), - *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), - *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING), + *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_CAUSAL_LM_MAPPING), + *get_values(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING), + *get_values(MODEL_FOR_MASKED_LM_MAPPING), + *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), ]: - inputs_dict["labels"] = tf.zeros( + inputs_dict["labels"] = torch.zeros( ( self.model_tester.batch_size, self.model_tester.seq_length, ), - dtype=tf.int32, + dtype=torch.long, + device=torch_device, + ) + elif model_class in get_values(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING): + num_patches = self.model_tester.image_size // self.model_tester.patch_size + inputs_dict["bool_masked_pos"] = torch.zeros( + (self.model_tester.batch_size, num_patches ** 2), + dtype=torch.long, + device=torch_device, ) return inputs_dict - def test_initialization(self): - pass - def test_save_load(self): ( config, @@ -167,16 +198,28 @@ def test_save_load(self): for model_class in self.all_model_classes: model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + out_2 = outputs[0].cpu().numpy() + out_2[np.isnan(out_2)] = 0 with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname, saved_model=False) + model.save_pretrained(tmpdirname) model = model_class.from_pretrained(tmpdirname) - after_outputs = model(self._prepare_for_class(inputs_dict, model_class)) + model.to(torch_device) + with torch.no_grad(): + after_outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - 
self.assert_outputs_same(after_outputs, outputs) + # Make sure we don't have nans + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) - def test_save_load_config(self): + def test_save_load_keys_to_ignore_on_save(self): ( config, inputs_dict, @@ -184,417 +227,288 @@ def test_save_load_config(self): for model_class in self.all_model_classes: model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - model_config = model.get_config() - # make sure that returned config is jsonifiable, which is required by keras - json.dumps(model_config) - new_model = model_class.from_config(model.get_config()) - # make sure it also accepts a normal config - _ = model_class.from_config(model.config) - _ = new_model(self._prepare_for_class(inputs_dict, model_class)) # Build model - new_model.set_weights(model.get_weights()) - after_outputs = new_model(self._prepare_for_class(inputs_dict, model_class)) - - self.assert_outputs_same(after_outputs, outputs) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.call) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] + _keys_to_ignore_on_save = getattr(model, "_keys_to_ignore_on_save", None) + if _keys_to_ignore_on_save is None: + continue - if model.config.is_encoder_decoder: - expected_arg_names = [ - "input_ids", - "attention_mask", - "decoder_input_ids", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask"] if "head_mask" and "decoder_head_mask" in arg_names else [] - ) - # Necessary to handle BART with newly added cross_attn_head_mask - expected_arg_names.extend( - ["cross_attn_head_mask", "encoder_outputs"] - if 
"cross_attn_head_mask" in arg_names - else ["encoder_outputs"] + # check the keys are in the original state_dict + for k in _keys_to_ignore_on_save: + self.assertIn( + k, + model.state_dict().keys(), + "\n".join(model.state_dict().keys()), ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - else: - expected_arg_names = ["input_ids"] - self.assertListEqual(arg_names[:1], expected_arg_names) + # check that certain keys didn't get saved with the model + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + output_model_file = os.path.join(tmpdirname, WEIGHTS_NAME) + state_dict_saved = torch.load(output_model_file) + for k in _keys_to_ignore_on_save: + self.assertNotIn( + k, + state_dict_saved.keys(), + "\n".join(state_dict_saved.keys()), + ) - def test_onnx_compliancy(self): - if not self.test_onnx: - return + # Test we can load the state dict in the model, necessary for the checkpointing API in Trainer. + load_result = model.load_state_dict(state_dict_saved, strict=False) + self.assertTrue( + len(load_result.missing_keys) == 0 + or set(load_result.missing_keys) == set(model._keys_to_ignore_on_save) + ) + self.assertTrue(len(load_result.unexpected_keys) == 0) + def test_gradient_checkpointing_backward_compatibility(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - INTERNAL_OPS = [ - "Assert", - "AssignVariableOp", - "EmptyTensorList", - "ReadVariableOp", - "ResourceGather", - "TruncatedNormal", - "VarHandleOp", - "VarIsInitializedOp", - ] - onnx_ops = [] - - with open(os.path.join(".", "utils", "tf_ops", "onnx.json")) as f: - onnx_opsets = json.load(f)["opsets"] - - for i in range(1, self.onnx_min_opset + 1): - onnx_ops.extend(onnx_opsets[str(i)]) for model_class in self.all_model_classes: - model_op_names = set() + if not model_class.supports_gradient_checkpointing: + continue - with tf.Graph().as_default() as g: - model = model_class(config) - 
model(model.dummy_inputs) + config.gradient_checkpointing = True + model = model_class(config) + self.assertTrue(model.is_gradient_checkpointing) - for op in g.get_operations(): - model_op_names.add(op.node_def.op) + def test_gradient_checkpointing_enable_disable(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() - model_op_names = sorted(model_op_names) - incompatible_ops = [] + for model_class in self.all_model_classes: + if not model_class.supports_gradient_checkpointing: + continue - for op in model_op_names: - if op not in onnx_ops and op not in INTERNAL_OPS: - incompatible_ops.append(op) + # at init model should have gradient checkpointing disabled + model = model_class(config) + self.assertFalse(model.is_gradient_checkpointing) - self.assertEqual(len(incompatible_ops), 0, incompatible_ops) + # check enable works + model.gradient_checkpointing_enable() + self.assertTrue(model.is_gradient_checkpointing) - @require_tf2onnx - @slow - def test_onnx_runtime_optimize(self): - if not self.test_onnx: - return + # check disable works + model.gradient_checkpointing_disable() + self.assertFalse(model.is_gradient_checkpointing) - import onnxruntime - import tf2onnx + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + def test_save_load_fast_init_from_base(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() + base_class = MODEL_MAPPING[config.__class__] + + if isinstance(base_class, tuple): + base_class = base_class[0] for model_class in self.all_model_classes: - model = model_class(config) - model(model.dummy_inputs) + if model_class == base_class: + continue - onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset) + # make a copy of model class to not break future tests + # from 
https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class + class CopyClass(model_class): + pass - onnxruntime.InferenceSession(onnx_model_proto.SerializeToString()) + model_class_copy = CopyClass - def test_keras_save_load(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + # make sure that all keys are expected for test + model_class_copy._keys_to_ignore_on_load_missing = [] - tf_main_layer_classes = set( - module_member - for model_class in self.all_model_classes - for module in (import_module(model_class.__module__),) - for module_member_name in dir(module) - if module_member_name.endswith("MainLayer") - # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`. - and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] - for module_member in (getattr(module, module_member_name),) - if isinstance(module_member, type) - and tf.keras.layers.Layer in module_member.__bases__ - and getattr(module_member, "_keras_serializable", False) - ) - for main_layer_class in tf_main_layer_classes: - # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter - if "T5" in main_layer_class.__name__: - # Take the same values than in TFT5ModelTester for this shared layer - shared = TFSharedEmbeddings(99, 32, name="shared") - config.use_cache = inputs_dict.pop("use_cache", None) - main_layer = main_layer_class(config, embed_tokens=shared) - else: - main_layer = main_layer_class(config) + # make init deterministic, but make sure that + # non-initialized weights throw errors nevertheless + model_class_copy._init_weights = self._mock_init_weights - symbolic_inputs = { - name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() - } + model = base_class(config) + state_dict = model.state_dict() - model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) - 
outputs = model(inputs_dict) + # this will often delete a single weight of a multi-weight module + # to test an edge case + random_key_to_del = random.choice(list(state_dict.keys())) + del state_dict[random_key_to_del] + # check that certain keys didn't get saved with the model with tempfile.TemporaryDirectory() as tmpdirname: - filepath = os.path.join(tmpdirname, "keras_model.h5") - model.save(filepath) - if "T5" in main_layer_class.__name__: - model = tf.keras.models.load_model( - filepath, - custom_objects={ - main_layer_class.__name__: main_layer_class, - "TFSharedEmbeddings": TFSharedEmbeddings, - }, - ) - else: - model = tf.keras.models.load_model( - filepath, - custom_objects={main_layer_class.__name__: main_layer_class}, - ) - assert isinstance(model, tf.keras.Model) - after_outputs = model(inputs_dict) - self.assert_outputs_same(after_outputs, outputs) - - def assert_outputs_same(self, after_outputs, outputs): - # Make sure we don't have nans - if isinstance(after_outputs, tf.Tensor): - out_1 = after_outputs.numpy() - elif isinstance(after_outputs, dict): - out_1 = after_outputs[list(after_outputs.keys())[0]].numpy() - else: - out_1 = after_outputs[0].numpy() - out_2 = outputs[0].numpy() - self.assertEqual(out_1.shape, out_2.shape) - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) + model.save_pretrained(tmpdirname) + torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin")) - @is_pt_tf_cross_test - def test_pt_tf_model_equivalence(self): - import torch + model_fast_init = model_class_copy.from_pretrained(tmpdirname) + model_slow_init = model_class_copy.from_pretrained(tmpdirname, _fast_init=False) - import transformers + for key in model_fast_init.state_dict().keys(): + max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item() + self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") + def 
test_save_load_fast_init_to_base(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() + base_class = MODEL_MAPPING[config.__class__] - for model_class in self.all_model_classes: - pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning - pt_model_class = getattr(transformers, pt_model_class_name) - - config.output_hidden_states = True + if isinstance(base_class, tuple): + base_class = base_class[0] - tf_model = model_class(config) - pt_model = pt_model_class(config) + for model_class in self.all_model_classes: - # Check we can load pt model in tf and vice-versa with model => model functions - tf_model = transformers.load_pytorch_model_in_tf2_model( - tf_model, - pt_model, - tf_inputs=self._prepare_for_class(inputs_dict, model_class), - ) - pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) + if model_class == base_class: + continue - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - pt_inputs_dict = {} - for name, key in self._prepare_for_class(inputs_dict, model_class).items(): - if type(key) == bool: - pt_inputs_dict[name] = key - elif name == "input_values": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - elif name == "pixel_values": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - elif name == "input_features": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - else: - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) + # make a copy of model class to not break future tests + # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class + class CopyClass(base_class): + pass - with torch.no_grad(): - pto = pt_model(**pt_inputs_dict) - tfo = tf_model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - ) + base_class_copy = CopyClass - tf_hidden_states = 
tfo[0].numpy() - pt_hidden_states = pto[0].numpy() + # make sure that all keys are expected for test + base_class_copy._keys_to_ignore_on_load_missing = [] - tf_nans = np.copy(np.isnan(tf_hidden_states)) - pt_nans = np.copy(np.isnan(pt_hidden_states)) + # make init deterministic, but make sure that + # non-initialized weights throw errors nevertheless + base_class_copy._init_weights = self._mock_init_weights - pt_hidden_states[tf_nans] = 0 - tf_hidden_states[tf_nans] = 0 - pt_hidden_states[pt_nans] = 0 - tf_hidden_states[pt_nans] = 0 + model = model_class(config) + state_dict = model.state_dict() - max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) - self.assertLessEqual(max_diff, 4e-2) + # this will often delete a single weight of a multi-weight module + # to test an edge case + random_key_to_del = random.choice(list(state_dict.keys())) + del state_dict[random_key_to_del] - # Check we can load pt model in tf and vice-versa with checkpoint => model functions + # check that certain keys didn't get saved with the model with tempfile.TemporaryDirectory() as tmpdirname: - pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") - torch.save(pt_model.state_dict(), pt_checkpoint_path) - tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) - - tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") - tf_model.save_weights(tf_checkpoint_path) - pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) + model.config.save_pretrained(tmpdirname) + torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin")) - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - pt_inputs_dict = {} - for name, key in self._prepare_for_class(inputs_dict, model_class).items(): - if type(key) == bool: - key = np.array(key, dtype=bool) - pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long) - elif name == 
"input_values": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - elif name == "pixel_values": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - elif name == "input_features": - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) - else: - pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) + model_fast_init = base_class_copy.from_pretrained(tmpdirname) + model_slow_init = base_class_copy.from_pretrained(tmpdirname, _fast_init=False) - with torch.no_grad(): - pto = pt_model(**pt_inputs_dict) - tfo = tf_model(self._prepare_for_class(inputs_dict, model_class)) - tfo = tfo[0].numpy() - pto = pto[0].numpy() - tf_nans = np.copy(np.isnan(tfo)) - pt_nans = np.copy(np.isnan(pto)) + for key in model_fast_init.state_dict().keys(): + max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item() + self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") - pto[tf_nans] = 0 - tfo[tf_nans] = 0 - pto[pt_nans] = 0 - tfo[pt_nans] = 0 + def test_initialization(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() - max_diff = np.amax(np.abs(tfo - pto)) - self.assertLessEqual(max_diff, 4e-2) + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) - def test_compile_tf_model(self): + def test_determinism(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - max_input = getattr(self.model_tester, "max_position_embeddings", 512) - optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) - loss = 
tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") for model_class in self.all_model_classes: - if model_class.__name__ in [ - "TFSpeech2TextModel", - "TFSpeech2TextForConditionalGeneration", - ]: - inputs = { - "decoder_input_ids": tf.keras.Input( - batch_shape=(2, max_input), - name="decoder_input_ids", - dtype="int32", - ), - "input_features": tf.keras.Input( - batch_shape=( - 2, - max_input, - self.model_tester.input_feat_per_channel * self.model_tester.input_channels, - ), - name="input_features", - dtype="float32", - ), - } - elif self.is_encoder_decoder: - inputs = { - "decoder_input_ids": tf.keras.Input( - batch_shape=(2, max_input), - name="decoder_input_ids", - dtype="int32", - ), - "input_ids": tf.keras.Input( - batch_shape=(2, max_input), - name="input_ids", - dtype="int32", - ), - } - # TODO: A better way to handle vision models - elif model_class.__name__ in [ - "TFViTModel", - "TFViTForImageClassification", - "TFCLIPVisionModel", - ]: - inputs = tf.keras.Input( - batch_shape=( - 3, - self.model_tester.num_channels, - self.model_tester.image_size, - self.model_tester.image_size, - ), - name="pixel_values", - dtype="float32", - ) - elif model_class.__name__ in ["TFCLIPModel"]: - inputs = { - "input_ids": tf.keras.Input( - batch_shape=(3, max_input), - name="input_ids", - dtype="int32", - ), - "pixel_values": tf.keras.Input( - batch_shape=( - 3, - self.model_tester.vision_model_tester.num_channels, - self.model_tester.vision_model_tester.image_size, - self.model_tester.vision_model_tester.image_size, - ), - name="pixel_values", - dtype="float32", - ), - } - elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs = tf.keras.Input( - batch_shape=(4, 2, max_input), - name="input_ids", - dtype="int32", - ) - else: - inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") - - # Prepare our model model = model_class(config) - 
model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving. - # Let's load it from the disk to be sure we can use pretrained weights - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname, saved_model=False) - model = model_class.from_pretrained(tmpdirname) + model.to(torch_device) + model.eval() + with torch.no_grad(): + first = model(**self._prepare_for_class(inputs_dict, model_class))[0] + second = model(**self._prepare_for_class(inputs_dict, model_class))[0] + + out_1 = first.cpu().numpy() + out_2 = second.cpu().numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) - outputs_dict = model(inputs) - hidden_states = outputs_dict[0] + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() - # Add a dense layer on top to test integration with other keras modules - outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] - # Compile extended model - extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs]) - extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) + if model.config.is_encoder_decoder: + expected_arg_names = [ + "input_ids", + "attention_mask", + "decoder_input_ids", + "decoder_attention_mask", + ] + expected_arg_names.extend( + [ + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", + "encoder_outputs", + ] + if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names + else ["encoder_outputs"] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + 
expected_arg_names = ["input_ids"] + self.assertListEqual(arg_names[:1], expected_arg_names) - def test_keyword_and_dict_args(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + def test_training(self): + if not self.model_tester.is_training: + return for model_class in self.all_model_classes: - model = model_class(config) - inputs = self._prepare_for_class(inputs_dict, model_class) + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True - outputs_dict = model(inputs) + if model_class in get_values(MODEL_MAPPING): + continue - inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - outputs_keywords = model(**inputs_keywords) - output_dict = outputs_dict[0].numpy() - output_keywords = outputs_keywords[0].numpy() + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + if not self.model_tester.is_training: + return - self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) + for model_class in self.all_model_classes: + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class in get_values(MODEL_MAPPING) or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() def test_attention_outputs(self): ( @@ -602,268 +516,967 @@ def test_attention_outputs(self): inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - decoder_seq_length = 
getattr( - self.model_tester, - "decoder_seq_length", - self.model_tester.seq_length, - ) - encoder_seq_length = getattr( - self.model_tester, - "encoder_seq_length", - self.model_tester.seq_length, - ) - decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - - def check_decoder_attentions_output(outputs): - out_len = len(outputs) - self.assertEqual(min(out_len % 2, out_len % 5), 0) # differentiation due to newly added cross_attentions - decoder_attentions = outputs.decoder_attentions - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - decoder_key_length, - ], - ) - def check_encoder_attentions_output(outputs): - attentions = [ - t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions) - ] - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length, - ], - ) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True - inputs_dict["use_cache"] = False - 
config.output_hidden_states = False + inputs_dict["output_hidden_states"] = False + config.return_dict = True model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - out_len = len(outputs) - self.assertEqual(config.output_hidden_states, False) - check_encoder_attentions_output(outputs) - - if self.is_encoder_decoder: - model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - self.assertEqual(config.output_hidden_states, False) - check_decoder_attentions_output(outputs) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - # Check that output attentions can also be changed via the config + # check that output_attentions also work using config del inputs_dict["output_attentions"] config.output_attentions = True model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) - self.assertEqual(config.output_hidden_states, False) - check_encoder_attentions_output(outputs) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [ + self.model_tester.num_attention_heads, + encoder_seq_length, + chunk_length, + encoder_key_length, + ], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + encoder_seq_length, + encoder_key_length, + ], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 
+ + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Question Answering model returns start_logits and end_logits + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): + correct_outlen += 1 # start_logits and end_logits instead of only 1 output + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + decoder_key_length, + ], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) # Check attention is always last and order is fine inputs_dict["output_attentions"] = True - config.output_hidden_states = True + inputs_dict["output_hidden_states"] = True model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) - self.assertEqual(model.config.output_hidden_states, True) - check_encoder_attentions_output(outputs) + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + 
added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) - def test_headmasking(self): - if not self.test_head_masking: - return + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [ + self.model_tester.num_attention_heads, + encoder_seq_length, + chunk_length, + encoder_key_length, + ], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + encoder_seq_length, + encoder_key_length, + ], + ) - random.Random().seed(42) + @slow + def test_torchscript(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - random.Random().seed() + self._create_and_check_torchscript(config, inputs_dict) + + @slow + def test_torchscript_output_attentions(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + config.output_attentions = True + self._create_and_check_torchscript(config, inputs_dict) + + @slow + def test_torchscript_output_hidden_state(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + self._create_and_check_torchscript(config, inputs_dict) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return - inputs_dict["output_attentions"] = True - config.output_hidden_states = True configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True for model_class in self.all_model_classes: model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class) - # Prepare head_mask - def prepare_layer_head_mask(i, attention_heads, 
num_hidden_layers): - if i == 0: - return tf.concat( - ( - tf.zeros(1, dtype=tf.float32), - tf.ones(attention_heads - 1, dtype=tf.float32), - ), - 0, - ) - elif i == num_hidden_layers - 1: - return tf.concat( + try: + if model.config.is_encoder_decoder: + model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward + input_ids = inputs["input_ids"] + attention_mask = inputs["attention_mask"] + decoder_input_ids = inputs["decoder_input_ids"] + decoder_attention_mask = inputs["decoder_attention_mask"] + traced_model = torch.jit.trace( + model, ( - tf.zeros(attention_heads - 1, dtype=tf.float32), - tf.ones(1, dtype=tf.float32), + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, ), - 0, ) else: - return tf.ones(attention_heads, dtype=tf.float32) - - head_mask = tf.stack( - [ - prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers) - for i in range(config.num_hidden_layers) - ], - 0, + input_ids = inputs["input_ids"] + traced_model = torch.jit.trace(model, input_ids) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in 
non_persistent_buffers + } + + self.assertEqual( + set(model_state_dict.keys()), + set(loaded_model_state_dict.keys()), ) + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + if layer_name in loaded_model_state_dict: + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_torch_fx(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + self._create_and_check_torch_fx_tracing(config, inputs_dict) + + def test_torch_fx_output_loss(self): + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + self._create_and_check_torch_fx_tracing(config, inputs_dict, output_loss=True) + + def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): + if not is_torch_fx_available() or not self.fx_compatible: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.return_dict = False + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss) + + try: + if model.config.is_encoder_decoder: + model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward + labels = inputs.get("labels", None) + input_names = [ + "input_ids", + "attention_mask", + "decoder_input_ids", + "decoder_attention_mask", + ] + if labels is not None: + input_names.append("labels") + filtered_inputs = {k: v for (k, 
v) in inputs.items() if k in input_names} + + model_output = model(**filtered_inputs) + + traced_model = symbolic_trace(model, input_names) + traced_output = traced_model(**filtered_inputs) + else: + input_names = [ + "input_ids", + "attention_mask", + "token_type_ids", + ] + input_ids = inputs["input_ids"] + + labels = inputs.get("labels", None) + start_positions = inputs.get("start_positions", None) + end_positions = inputs.get("end_positions", None) + if labels is not None: + input_names.append("labels") + if start_positions is not None: + input_names.append("start_positions") + if end_positions is not None: + input_names.append("end_positions") + + filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} + input_names = filtered_inputs.keys() + + model_output = model(**filtered_inputs) + + rank = len(input_ids.shape) + if rank not in [2, 3]: + raise NotImplementedError( + f"symbolic_trace automatic parameters inference not implemented for input of rank {rank}." + ) + + traced_model = symbolic_trace(model, input_names) + traced_output = traced_model(**filtered_inputs) + + except RuntimeError: + self.fail("Couldn't trace module.") + + def flatten_output(output): + flatten = [] + for x in output: + if isinstance(x, (tuple, list)): + flatten += flatten_output(x) + elif not isinstance(x, torch.Tensor): + continue + else: + flatten.append(x) + return flatten + + model_output = flatten_output(model_output) + traced_output = flatten_output(traced_output) + num_outputs = len(model_output) + + for i in range(num_outputs): + self.assertTrue( + torch.allclose(model_output[i], traced_output[i]), + f"traced {i}th output doesn't match model {i}th output for {model_class}", + ) + + def test_headmasking(self): + if not self.test_head_masking: + return + + global_rng.seed(42) + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + global_rng.seed() + + inputs_dict["output_attentions"] = True + config.output_hidden_states = 
True + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + # Prepare head_mask + # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) + head_mask = torch.ones( + self.model_tester.num_hidden_layers, + self.model_tester.num_attention_heads, + device=torch_device, + ) + head_mask[0, 0] = 0 + head_mask[-1, :-1] = 0 + head_mask.requires_grad_(requires_grad=True) inputs = self._prepare_for_class(inputs_dict, model_class).copy() inputs["head_mask"] = head_mask if model.config.is_encoder_decoder: - signature = inspect.signature(model.call) + signature = inspect.signature(model.forward) arg_names = [*signature.parameters.keys()] if "decoder_head_mask" in arg_names: # necessary diferentiation because of T5 model inputs["decoder_head_mask"] = head_mask if "cross_attn_head_mask" in arg_names: inputs["cross_attn_head_mask"] = head_mask - outputs = model(**inputs, return_dict=True) + # Test that we can get a gradient back for importance score computation + output = sum(t.sum() for t in outputs[0]) + output = output.sum() + output.backward() + multihead_outputs = head_mask.grad + + self.assertIsNotNone(multihead_outputs) + self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) + def check_attentions_validity(attentions): # Remove Nan for t in attentions: self.assertLess( - (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), - (tf.size(t) / 4).numpy(), + torch.sum(torch.isnan(t)), t.numel() / 4 ) # Check we don't have more than 25% nans (arbitrary) - attentions = [ - tf.where(tf.math.is_nan(t), 0.0, t) for t in attentions + t.masked_fill(torch.isnan(t), 0.0) for t in attentions ] # remove them (the test is less complete) - self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0) - 
self.assertNotEqual( - tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), - 0.0, - ) - if len(attentions) > 2: # encoder-decodere models have only 2 layers in each modules - self.assertNotEqual( - tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), - 0.0, - ) - self.assertAlmostEqual( - tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), - 0.0, - ) - self.assertNotEqual( - tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), - 0.0, - ) + self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) + if len(attentions) > 2: # encoder-decoder models have only 2 layers in each module + self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) if model.config.is_encoder_decoder: check_attentions_validity(outputs.encoder_attentions) check_attentions_validity(outputs.decoder_attentions) - if "cross_attn_head_mask" in arg_names: - check_attentions_validity(outputs.cross_attentions) + check_attentions_validity(outputs.cross_attentions) else: check_attentions_validity(outputs.attentions) - def test_hidden_states_output(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + def test_head_pruning(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + inputs_dict["output_attentions"] = True + config.output_hidden_states = False + model = model_class(config=config) + model.to(torch_device) + model.eval() + heads_to_prune = { + 0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0], + } + model.prune_heads(heads_to_prune) + 
with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual( + attentions[-1].shape[-3], + self.model_tester.num_attention_heads - 1, + ) + + def test_head_pruning_save_load_from_pretrained(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + inputs_dict["output_attentions"] = True + config.output_hidden_states = False + model = model_class(config=config) + model.to(torch_device) + model.eval() + heads_to_prune = { + 0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0], + } + model.prune_heads(heads_to_prune) + + with tempfile.TemporaryDirectory() as temp_dir_name: + model.save_pretrained(temp_dir_name) + model = model_class.from_pretrained(temp_dir_name) + model.to(torch_device) + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs[-1] + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual( + attentions[-1].shape[-3], + self.model_tester.num_attention_heads - 1, + ) + + def test_head_pruning_save_load_from_config_init(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + inputs_dict["output_attentions"] = True + config.output_hidden_states = False + + heads_to_prune = { + 0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0], + } + config.pruned_heads = heads_to_prune + + model 
= model_class(config=config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual( + attentions[-1].shape[-3], + self.model_tester.num_attention_heads - 1, + ) + + def test_head_pruning_integration(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + inputs_dict["output_attentions"] = True + config.output_hidden_states = False + + heads_to_prune = {0: [0], 1: [1, 2]} + config.pruned_heads = heads_to_prune + + model = model_class(config=config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs[-1] + + self.assertEqual( + attentions[0].shape[-3], + self.model_tester.num_attention_heads - 1, + ) + self.assertEqual( + attentions[1].shape[-3], + self.model_tester.num_attention_heads - 2, + ) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + with tempfile.TemporaryDirectory() as temp_dir_name: + model.save_pretrained(temp_dir_name) + model = model_class.from_pretrained(temp_dir_name) + model.to(torch_device) + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs[-1] + + self.assertEqual( + attentions[0].shape[-3], + self.model_tester.num_attention_heads - 1, + ) + self.assertEqual( + attentions[1].shape[-3], + self.model_tester.num_attention_heads - 2, + ) + self.assertEqual(attentions[2].shape[-3], 
self.model_tester.num_attention_heads) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - def check_hidden_states_output(config, inputs_dict, model_class): + heads_to_prune = {0: [0], 2: [1, 2]} + model.prune_heads(heads_to_prune) + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs[-1] + + self.assertEqual( + attentions[0].shape[-3], + self.model_tester.num_attention_heads - 1, + ) + self.assertEqual( + attentions[1].shape[-3], + self.model_tester.num_attention_heads - 2, + ) + self.assertEqual( + attentions[2].shape[-3], + self.model_tester.num_attention_heads - 2, + ) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) - outputs = model(self._prepare_for_class(inputs_dict, model_class)) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1, ) + self.assertEqual(len(hidden_states), expected_num_layers) - if model.config.is_encoder_decoder: - encoder_hidden_states = outputs.encoder_hidden_states - decoder_hidden_states = outputs.decoder_hidden_states - - self.assertEqual(config.output_attentions, False) - self.assertEqual(len(encoder_hidden_states), expected_num_layers) - self.assertListEqual( - list(encoder_hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], - ) - self.assertEqual(len(decoder_hidden_states), expected_num_layers) - self.assertListEqual( - 
list(decoder_hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], - ) + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: + seq_length = seq_length * self.model_tester.chunk_length else: - hidden_states = outputs.hidden_states - self.assertEqual(config.output_attentions, False) + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + self.assertListEqual( list(hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], + [decoder_seq_length, self.model_tester.hidden_size], ) + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True - check_hidden_states_output(config, inputs_dict, model_class) + check_hidden_states_output(inputs_dict, config, model_class) + # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True - check_hidden_states_output(config, inputs_dict, model_class) - def test_model_common_attributes(self): + check_hidden_states_output(inputs_dict, config, model_class) + + def test_retain_grad_hidden_states_attentions(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - text_in_text_out_models = ( - get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) - + 
get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) - + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING) - ) - speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING) + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + if config.is_encoder_decoder: + # Seq2Seq models + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_hidden_states = outputs.decoder_hidden_states[0] + decoder_attentions = outputs.decoder_attentions[0] + decoder_hidden_states.retain_grad() + decoder_attentions.retain_grad() + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_hidden_states.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + else: + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_feed_forward_chunking(self): + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: + torch.manual_seed(0) + config = copy.deepcopy(original_config) model = model_class(config) - assert 
isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) - if model_class in text_in_text_out_models: - x = model.get_output_embeddings() - assert isinstance(x, tf.keras.layers.Layer) - name = model.get_bias() - assert isinstance(name, dict) - for k, v in name.items(): - assert isinstance(v, tf.Variable) - elif model_class in speech_in_text_out_models: - x = model.get_output_embeddings() - assert isinstance(x, tf.keras.layers.Layer) - name = model.get_bias() - assert name is None + model.to(torch_device) + model.eval() + + hidden_states_no_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] + + torch.manual_seed(0) + config.chunk_size_feed_forward = 1 + model = model_class(config) + model.to(torch_device) + model.eval() + + hidden_states_with_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] + self.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3)) + + def test_resize_position_vector_embeddings(self): + if not self.test_resize_position_embeddings: + return + + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + if self.model_tester.is_training is False: + model.eval() + + max_position_embeddings = config.max_position_embeddings + + # Retrieve the embeddings and clone theme + if model.config.is_encoder_decoder: + ( + encoder_model_embed, + decoder_model_embed, + ) = model.get_position_embeddings() + encoder_cloned_embeddings = encoder_model_embed.weight.clone() + decoder_cloned_embeddings = decoder_model_embed.weight.clone() else: - x = model.get_output_embeddings() - assert x is None - name = model.get_bias() - assert name is None + model_embed = model.get_position_embeddings() + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the position embeddings with a larger 
max_position_embeddings increases + # the model's postion embeddings size + model.resize_position_embeddings(max_position_embeddings + 10) + self.assertEqual( + model.config.max_position_embeddings, + max_position_embeddings + 10, + ) - def test_determinism(self): + # Check that it actually resizes the embeddings matrix + if model.config.is_encoder_decoder: + ( + encoder_model_embed, + decoder_model_embed, + ) = model.get_position_embeddings() + self.assertEqual( + encoder_model_embed.weight.shape[0], + encoder_cloned_embeddings.shape[0] + 10, + ) + self.assertEqual( + decoder_model_embed.weight.shape[0], + decoder_cloned_embeddings.shape[0] + 10, + ) + else: + model_embed = model.get_position_embeddings() + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the position embeddings with a smaller max_position_embeddings decreases + # the model's max_position_embeddings + model.resize_position_embeddings(max_position_embeddings - 5) + self.assertEqual( + model.config.max_position_embeddings, + max_position_embeddings - 5, + ) + + # Check that it actually resizes the embeddings matrix + if model.config.is_encoder_decoder: + ( + encoder_model_embed, + decoder_model_embed, + ) = model.get_position_embeddings() + self.assertEqual( + encoder_model_embed.weight.shape[0], + encoder_cloned_embeddings.shape[0] - 5, + ) + self.assertEqual( + decoder_model_embed.weight.shape[0], + decoder_cloned_embeddings.shape[0] - 5, + ) + else: + model_embed = model.get_position_embeddings() + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens 
has not modified the first part of the embedding matrix. + models_equal = True + + if model.config.is_encoder_decoder: + for p1, p2 in zip(encoder_cloned_embeddings, encoder_model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + for p1, p2 in zip(decoder_cloned_embeddings, decoder_model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + else: + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_resize_tokens_embeddings(self): + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + if self.model_tester.is_training is False: + model.eval() + + model_vocab_size = config.vocab_size + # Retrieve the embeddings and clone theme + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + 
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1) + + # make sure that decoder_input_ids are resized as well + if "decoder_input_ids" in inputs_dict: + inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. + models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_resize_embeddings_untied(self): + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return + + original_config.tie_word_embeddings = False + + # if model cannot untied embeddings -> leave test + if original_config.tie_word_embeddings: + return + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config).to(torch_device) + + # if no output embeddings -> leave test + if model.get_output_embeddings() is None: + continue + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_vocab_size = config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) + # Check that the model can still do a forward pass successfully 
(every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1) + if "decoder_input_ids" in inputs_dict: + inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + def test_model_common_attributes(self): ( config, inputs_dict, @@ -871,50 +1484,127 @@ def test_determinism(self): for model_class in self.all_model_classes: model = model_class(config) - first, second = ( - model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - )[0], - model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - )[0], - ) - out_1 = first.numpy() - out_2 = second.numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) + self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding, AdaptiveEmbedding)) + model.set_input_embeddings(nn.Embedding(10, 10)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def 
test_model_main_input_name(self): + for model_class in self.all_model_classes: + model_signature = inspect.signature(getattr(model_class, "forward")) + # The main input is the name of the argument after `self` + observed_main_input_name = list(model_signature.parameters.keys())[1] + self.assertEqual(model_class.main_input_name, observed_main_input_name) + + def test_correct_missing_keys(self): + if not self.test_missing_keys: + return + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + base_model_prefix = model.base_model_prefix + + if hasattr(model, base_model_prefix): + with tempfile.TemporaryDirectory() as temp_dir_name: + model.base_model.save_pretrained(temp_dir_name) + model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True) + with self.subTest(msg=f"Missing keys for {model.__class__.__name__}"): + self.assertGreater(len(loading_info["missing_keys"]), 0) + + def test_tie_model_weights(self): + if not self.test_torchscript: + return + + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + def check_same_values(layer_1, layer_2): + equal = True + for p1, p2 in zip(layer_1.weight, layer_2.weight): + if p1.data.ne(p2.data).sum() > 0: + equal = False + return equal + + for model_class in self.all_model_classes: + config.torchscript = True + model_not_tied = model_class(config) + if model_not_tied.get_output_embeddings() is None: + continue - def test_model_outputs_equivalence(self): + config_tied = copy.deepcopy(config) + config_tied.torchscript = False + model_tied = model_class(config_tied) + params_tied = list(model_tied.parameters()) + # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(check_same_values(embeddings, decoding)) + + # # Check that after modification, they remain the same. 
+ # embeddings.weight.data.div_(2) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) + # self.assertTrue(check_same_values(embeddings, decoding)) + + # # Check that after modification, they remain the same. + # decoding.weight.data.div_(4) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) + # self.assertTrue(check_same_values(embeddings, decoding)) + + # Check that after resize they remain tied. + model_tied.resize_token_embeddings(config.vocab_size + 10) + params_tied_2 = list(model_tied.parameters()) + self.assertEqual(len(params_tied_2), len(params_tied)) + + # decoding.weight.data.mul_(20) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) + # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) + def test_model_outputs_equivalence(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) - dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - all(tf.equal(tuple_object, dict_object)), - msg=f"Tuple and dict output are not equal. 
Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}", - ) + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), + set_nan_tensor_to_zero(dict_object), + atol=1e-5, + ), + msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. 
Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.", + ) recursive_check(tuple_output, dict_output) for model_class in self.all_model_classes: model = model_class(config) + model.to(torch_device) + model.eval() tuple_inputs = self._prepare_for_class(inputs_dict, model_class) dict_inputs = self._prepare_for_class(inputs_dict, model_class) @@ -949,434 +1639,523 @@ def recursive_check(tuple_object, dict_object): {"output_hidden_states": True, "output_attentions": True}, ) - def test_inputs_embeds(self): + @is_pt_tf_cross_test + def test_pt_tf_model_equivalence(self): + import numpy as np + import tensorflow as tf + + import transformers + ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - model = model_class(config) + tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning - inputs = copy.deepcopy(inputs_dict) + if not hasattr(transformers, tf_model_class_name): + # transformers does not have TF version yet + return - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) + tf_model_class = getattr(transformers, tf_model_class_name) - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids) - else: - inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids) - inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids) + config.output_hidden_states = True - inputs = self._prepare_for_class(inputs, model_class) + tf_model = tf_model_class(config) + pt_model = model_class(config) - model(inputs) + # make sure only tf inputs are forward that actually exist in function args + tf_input_keys = 
set(inspect.signature(tf_model.call).parameters.keys()) - def test_numpy_arrays_inputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + # remove all head masks + tf_input_keys.discard("head_mask") + tf_input_keys.discard("cross_attn_head_mask") + tf_input_keys.discard("decoder_head_mask") - def prepare_numpy_arrays(inputs_dict): - inputs_np_dict = {} - for k, v in inputs_dict.items(): - if tf.is_tensor(v): - inputs_np_dict[k] = v.numpy() + pt_inputs = self._prepare_for_class(inputs_dict, model_class) + pt_inputs = {k: v for k, v in pt_inputs.items() if k in tf_input_keys} + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + tf_inputs_dict = {} + for key, tensor in pt_inputs.items(): + # skip key that does not exist in tf + if type(tensor) == bool: + tf_inputs_dict[key] = tensor + elif key == "input_values": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) + elif key == "pixel_values": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) + elif key == "input_features": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) else: - inputs_np_dict[k] = np.array(k) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) - return inputs_np_dict + # Check we can load pt model in tf and vice-versa with model => model functions + tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict) + pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model).to(torch_device) - for model_class in self.all_model_classes: - model = model_class(config) + # need to rename encoder-decoder "inputs" for PyTorch + # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: + # pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") - inputs = 
self._prepare_for_class(inputs_dict, model_class) - inputs_np = prepare_numpy_arrays(inputs) + with torch.no_grad(): + pto = pt_model(**pt_inputs) + tfo = tf_model(tf_inputs_dict, training=False) - output_for_dict_input = model(inputs_np) - output_for_kw_input = model(**inputs_np) - self.assert_outputs_same(output_for_dict_input, output_for_kw_input) + tf_hidden_states = tfo[0].numpy() + pt_hidden_states = pto[0].cpu().numpy() - def test_resize_token_embeddings(self): - if not self.test_resize_embeddings: - return + tf_nans = np.copy(np.isnan(tf_hidden_states)) + pt_nans = np.copy(np.isnan(pt_hidden_states)) + + pt_hidden_states[tf_nans] = 0 + tf_hidden_states[tf_nans] = 0 + pt_hidden_states[pt_nans] = 0 + tf_hidden_states[pt_nans] = 0 + + max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) + self.assertLessEqual(max_diff, 4e-2) + + # Check we can load pt model in tf and vice-versa with checkpoint => model functions + with tempfile.TemporaryDirectory() as tmpdirname: + pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") + torch.save(pt_model.state_dict(), pt_checkpoint_path) + tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) + + tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") + tf_model.save_weights(tf_checkpoint_path) + pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) + pt_model = pt_model.to(torch_device) + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + tf_inputs_dict = {} + for key, tensor in pt_inputs.items(): + # skip key that does not exist in tf + if type(tensor) == bool: + tensor = np.array(tensor, dtype=bool) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32) + elif key == "input_values": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) + elif key == "pixel_values": + tf_inputs_dict[key] = 
tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) + elif key == "input_features": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) + else: + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) + + # need to rename encoder-decoder "inputs" for PyTorch + # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: + # pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") + + with torch.no_grad(): + pto = pt_model(**pt_inputs) + + tfo = tf_model(tf_inputs_dict) + tfo = tfo[0].numpy() + pto = pto[0].cpu().numpy() + tf_nans = np.copy(np.isnan(tfo)) + pt_nans = np.copy(np.isnan(pto)) + + pto[tf_nans] = 0 + tfo[tf_nans] = 0 + pto[pt_nans] = 0 + tfo[pt_nans] = 0 + + max_diff = np.amax(np.abs(tfo - pto)) + self.assertLessEqual(max_diff, 4e-2) + + def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): + diff = np.abs((a - b)).max() + self.assertLessEqual( + diff, + tol, + f"Difference between torch and flax is {diff} (>= {tol}).", + ) + + @is_pt_flax_cross_test + def test_equivalence_pt_to_flax(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - def _get_word_embedding_weight(model, embedding_layer): - embeds = getattr(embedding_layer, "weight", None) - if embeds is not None: - return embeds + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): - embeds = getattr(embedding_layer, "decoder", None) - if embeds is not None: - return embeds + # load PyTorch class + pt_model = model_class(config).eval() + # Flax models don't use the `use_cache` option and cache is not returned as a default. + # So we disable `use_cache` here for PyTorch model. 
+ pt_model.config.use_cache = False - model(model.dummy_inputs) + fx_model_class_name = "Flax" + model_class.__name__ - embeds = getattr(embedding_layer, "weight", None) - if embeds is not None: - return embeds + if not hasattr(transformers, fx_model_class_name): + return - embeds = getattr(embedding_layer, "decoder", None) - if embeds is not None: - return embeds + fx_model_class = getattr(transformers, fx_model_class_name) - return None + # load Flax class + fx_model = fx_model_class(config, dtype=jnp.float32) + # make sure only flax inputs are forward that actually exist in function args + fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() - for model_class in self.all_model_classes: - for size in [config.vocab_size - 10, config.vocab_size + 10, None]: - # build the embeddings - model = model_class(config=config) - old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) - old_bias = model.get_bias() - old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) - # reshape the embeddings - model.resize_token_embeddings(size) - new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) - new_bias = model.get_bias() - new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) - - # check that the resized embeddings size matches the desired size. 
- assert_size = size if size is not None else config.vocab_size - self.assertEqual(new_input_embeddings.shape[0], assert_size) - - # check that weights remain the same after resizing - models_equal = True - for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): - if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: - models_equal = False - self.assertTrue(models_equal) - - if old_bias is not None and new_bias is not None: - for old_weight, new_weight in zip(old_bias.values(), new_bias.values()): - self.assertEqual(new_weight.shape[0], assert_size) - - models_equal = True - for p1, p2 in zip(old_weight.value(), new_weight.value()): - if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: - models_equal = False - self.assertTrue(models_equal) - - if old_output_embeddings is not None and new_output_embeddings is not None: - self.assertEqual(new_output_embeddings.shape[0], assert_size) - self.assertEqual( - new_output_embeddings.shape[1], - old_output_embeddings.shape[1], - ) + # prepare inputs + pt_inputs = self._prepare_for_class(inputs_dict, model_class) + + # remove function args that don't exist in Flax + pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} + + fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) + fx_model.params = fx_state - models_equal = True - for p1, p2 in zip( - old_output_embeddings.value(), - new_output_embeddings.value(), - ): - if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: - models_equal = False - self.assertTrue(models_equal) + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() - def test_lm_head_model_random_no_beam_search_generate(self): + # convert inputs to Flax + fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} + fx_outputs = fx_model(**fx_inputs).to_tuple() + self.assertEqual( + len(fx_outputs), + len(pt_outputs), + "Output lengths differ between Flax and PyTorch", + ) + for fx_output, pt_output in zip(fx_outputs, 
pt_outputs): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + pt_model.save_pretrained(tmpdirname) + fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True) + + fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple() + self.assertEqual( + len(fx_outputs_loaded), + len(pt_outputs), + "Output lengths differ between Flax and PyTorch", + ) + for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs): + self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) + + @is_pt_flax_cross_test + def test_equivalence_flax_to_pt(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - # iterate over all generative models - for model_class in self.all_generative_model_classes: - model = model_class(config) + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + # load corresponding PyTorch class + pt_model = model_class(config).eval() - if config.bos_token_id is None: - # if bos token id is not defined model needs input_ids - with self.assertRaises(AssertionError): - model.generate(do_sample=True, max_length=5) - # num_return_sequences = 1 - self._check_generated_ids(model.generate(input_ids, do_sample=True)) - elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]: - # Models with non-text inputs won't work here; num_return_sequences = 1 - self._check_generated_ids(model.generate(do_sample=True, max_length=5)) - - with self.assertRaises(ValueError): - # generating multiple sequences when no beam search generation - # is not allowed as it would always generate the same sequences - model.generate(input_ids, do_sample=False, num_return_sequences=2) - - # num_return_sequences > 1, sample - self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2)) - - # check bad words tokens language generation - # 
create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [ - self._generate_random_bad_tokens(1, model), - self._generate_random_bad_tokens(2, model), - ] - output_tokens = model.generate( - input_ids, - do_sample=True, - bad_words_ids=bad_words_ids, - num_return_sequences=2, - ) - # only count generated tokens - generated_ids = output_tokens[:, input_ids.shape[-1] :] - self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) + # So we disable `use_cache` here for PyTorch model. + pt_model.config.use_cache = False + + fx_model_class_name = "Flax" + model_class.__name__ + + if not hasattr(transformers, fx_model_class_name): + # no flax model exists for this class + return + + fx_model_class = getattr(transformers, fx_model_class_name) + + # load Flax class + fx_model = fx_model_class(config, dtype=jnp.float32) + # make sure only flax inputs are forward that actually exist in function args + fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() + + pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) + + # make sure weights are tied in PyTorch + pt_model.tie_weights() + + # prepare inputs + pt_inputs = self._prepare_for_class(inputs_dict, model_class) + + # remove function args that don't exist in Flax + pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} + + fx_outputs = fx_model(**fx_inputs).to_tuple() + self.assertEqual( + len(fx_outputs), + len(pt_outputs), + "Output lengths differ between Flax and PyTorch", + ) + + for fx_output, pt_output in zip(fx_outputs, pt_outputs): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - def test_lm_head_model_no_beam_search_generate_dict_outputs(self): + with tempfile.TemporaryDirectory() as tmpdirname: + fx_model.save_pretrained(tmpdirname) + 
pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True) + + with torch.no_grad(): + pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() + + self.assertEqual( + len(fx_outputs), + len(pt_outputs_loaded), + "Output lengths differ between Flax and PyTorch", + ) + for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + def test_inputs_embeds(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - if input_ids is None: - input_ids = inputs_dict.get("input_features", None) - # iterate over all generative models - for model_class in self.all_generative_model_classes: + for model_class in self.all_model_classes: model = model_class(config) - output_greedy = model.generate( - input_ids, - do_sample=False, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - output_sample = model.generate( - input_ids, - do_sample=True, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) + model.to(torch_device) + model.eval() - if model.config.is_encoder_decoder: - self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput) - self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput) + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) else: - self.assertIsInstance(output_greedy, TFGreedySearchDecoderOnlyOutput) - 
self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput) + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - def test_lm_head_model_random_beam_search_generate(self): + with torch.no_grad(): + model(**inputs)[0] + + @require_torch_multi_gpu + def test_multi_gpu_data_parallel_forward(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - for model_class in self.all_generative_model_classes: + # some params shouldn't be scattered by nn.DataParallel + # so just remove them if they are present. + blacklist_non_batched_params = [ + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", + ] + for k in blacklist_non_batched_params: + inputs_dict.pop(k, None) + + # move input tensors to cuda:O + for k, v in inputs_dict.items(): + if torch.is_tensor(v): + inputs_dict[k] = v.to(0) + + for model_class in self.all_model_classes: + model = model_class(config=config) + model.to(0) + model.eval() + + # Wrap model in nn.DataParallel + model = nn.DataParallel(model) + with torch.no_grad(): + _ = model(**self._prepare_for_class(inputs_dict, model_class)) + + @require_torch_multi_gpu + def test_model_parallelization(self): + if not self.test_model_parallel: + return + + # a candidate for testing_utils + def get_current_gpu_memory_use(): + """returns a list of cuda memory allocations per GPU in MBs""" + + per_device_memory = [] + for id in range(torch.cuda.device_count()): + with torch.cuda.device(id): + per_device_memory.append(torch.cuda.memory_allocated() >> 20) + + return per_device_memory + + # Needs a large model to see the difference. + config = self.model_tester.get_large_model_config() + + for model_class in self.all_parallelizable_model_classes: + torch.cuda.empty_cache() + + # 1. 
single gpu memory load + unload + memory measurements + # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests) + memory_at_start = get_current_gpu_memory_use() + + # Put model on device 0 and take a memory snapshot model = model_class(config) + model.to("cuda:0") + memory_after_model_load = get_current_gpu_memory_use() - if config.bos_token_id is None: - # if bos token id is not defined model needs input_ids, num_return_sequences = 1 - self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2)) - else: - # num_return_sequences = 1 - self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2)) - - with self.assertRaises(AssertionError): - # generating more sequences than having beams leads is not possible - model.generate( - input_ids, - do_sample=False, - num_return_sequences=3, - num_beams=2, - ) + # The memory use on device 0 should be higher than it was initially. + self.assertGreater(memory_after_model_load[0], memory_at_start[0]) - # num_return_sequences > 1, sample - self._check_generated_ids( - model.generate( - input_ids, - do_sample=True, - num_beams=2, - num_return_sequences=2, - ) - ) - # num_return_sequences > 1, greedy - self._check_generated_ids( - model.generate( - input_ids, - do_sample=False, - num_beams=2, - num_return_sequences=2, - ) - ) + del model + gc.collect() + torch.cuda.empty_cache() - # check bad words tokens language generation - # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [ - self._generate_random_bad_tokens(1, model), - self._generate_random_bad_tokens(2, model), - ] - output_tokens = model.generate( - input_ids, - do_sample=False, - bad_words_ids=bad_words_ids, - num_beams=2, - num_return_sequences=2, - ) - # only count generated tokens - generated_ids = output_tokens[:, input_ids.shape[-1] :] - self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) + # 2. 
MP test + # it's essential to re-calibrate the usage before the next stage + memory_at_start = get_current_gpu_memory_use() + + # Spread model layers over multiple devices + model = model_class(config) + model.parallelize() + memory_after_parallelization = get_current_gpu_memory_use() + + # Assert that the memory use on all devices is higher than it was when loaded only on CPU + for n in range(torch.cuda.device_count()): + self.assertGreater(memory_after_parallelization[n], memory_at_start[n]) + + # Assert that the memory use of device 0 is lower than it was when the entire model was loaded on it + self.assertLess(memory_after_parallelization[0], memory_after_model_load[0]) + + # Assert that the memory use of device 1 is higher than it was when the entire model was loaded + # on device 0 and device 1 wasn't used at all + self.assertGreater(memory_after_parallelization[1], memory_after_model_load[1]) + + del model + gc.collect() + torch.cuda.empty_cache() + + @require_torch_multi_gpu + def test_model_parallel_equal_results(self): + if not self.test_model_parallel: + return - def test_lm_head_model_beam_search_generate_dict_outputs(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - if input_ids is None: - input_ids = inputs_dict.get("input_features", None) - # iterate over all generative models - for model_class in self.all_generative_model_classes: + for model_class in self.all_parallelizable_model_classes: + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + + def cast_to_device(dictionary, device): + output = {} + for k, v in dictionary.items(): + if isinstance(v, torch.Tensor): + output[k] = v.to(device) + else: + output[k] = v + + return output + model = model_class(config) - output_beam_search = model.generate( - input_ids, - num_beams=2, - do_sample=False, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - 
return_dict_in_generate=True, - ) - output_beam_sample = model.generate( - input_ids, - num_beams=2, - do_sample=True, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) + output = model(**cast_to_device(inputs_dict, "cpu")) - if model.config.is_encoder_decoder: - self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput) - self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput) - else: - self.assertIsInstance(output_beam_search, TFBeamSearchDecoderOnlyOutput) - self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput) + model.parallelize() + + parallel_output = model(**cast_to_device(inputs_dict, "cuda:0")) + + for value, parallel_value in zip(output, parallel_output): + if isinstance(value, torch.Tensor): + self.assertTrue(torch.allclose(value, parallel_value.to("cpu"), atol=1e-7)) + elif isinstance(value, (Tuple, List)): + for value_, parallel_value_ in zip(value, parallel_value): + self.assertTrue(torch.allclose(value_, parallel_value_.to("cpu"), atol=1e-7)) + + @require_torch_multi_gpu + def test_model_parallel_beam_search(self): + if not self.test_model_parallel: + return + + all_generative_and_parallelizable_model_classes = tuple( + set(self.all_generative_model_classes).intersection(self.all_parallelizable_model_classes) + ) - def test_loss_computation(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: + + for model_class in all_generative_and_parallelizable_model_classes: + inputs_dict = self._prepare_for_class(inputs_dict, model_class) model = model_class(config) - if getattr(model, "hf_compute_loss", None): - # The number of elements in the loss should be the same as the number of elements in the label - prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - added_label = prepared_for_class[ - sorted( - 
list(prepared_for_class.keys() - inputs_dict.keys()), - reverse=True, - )[0] - ] - loss_size = tf.size(added_label) - if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING): - # if loss is causal lm loss, labels are shift, so that one label per batch - # is cut - loss_size = loss_size - self.model_tester.batch_size + def cast_to_device(dictionary, device): + output = {} + for k, v in dictionary.items(): + if isinstance(v, torch.Tensor): + output[k] = v.to(device) + else: + output[k] = v - # Test that model correctly compute the loss with kwargs - prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - possible_input_names = { - "input_ids", - "pixel_values", - "input_features", - } - input_name = possible_input_names.intersection(set(prepared_for_class)).pop() - model_input = prepared_for_class.pop(input_name) - - loss = model(model_input, **prepared_for_class)[0] - self.assertEqual(loss.shape, [loss_size]) - - # Test that model correctly compute the loss with a dict - prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - loss = model(prepared_for_class)[0] - self.assertEqual(loss.shape, [loss_size]) - - # Test that model correctly compute the loss with a tuple - prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - - # Get keys that were added with the _prepare_for_class function - label_keys = prepared_for_class.keys() - inputs_dict.keys() - signature = inspect.signature(model.call).parameters - signature_names = list(signature.keys()) - - # Create a dictionary holding the location of the tensors in the tuple - tuple_index_mapping = {0: input_name} - for label_key in label_keys: - label_key_index = signature_names.index(label_key) - tuple_index_mapping[label_key_index] = label_key - sorted_tuple_index_mapping = sorted(tuple_index_mapping.items()) - # Initialize a list with their default values, update the values and 
convert to a tuple - list_input = [] - - for name in signature_names: - if name != "kwargs": - list_input.append(signature[name].default) - - for index, value in sorted_tuple_index_mapping: - list_input[index] = prepared_for_class[value] - - tuple_input = tuple(list_input) - - # Send to model - loss = model(tuple_input[:-1])[0] - - self.assertEqual(loss.shape, [loss_size]) - - def test_generate_with_headmasking(self): - attention_names = [ - "encoder_attentions", - "decoder_attentions", - "cross_attentions", - ] + return output + + model.parallelize() + model.generate(**cast_to_device(inputs_dict, "cuda:0"), num_beams=2) + + def test_problem_types(self): ( config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_generative_model_classes: - model = model_class(config) + problem_types = [ + { + "title": "multi_label_classification", + "num_labels": 2, + "dtype": torch.float, + }, + { + "title": "single_label_classification", + "num_labels": 1, + "dtype": torch.long, + }, + {"title": "regression", "num_labels": 1, "dtype": torch.float}, + ] - # We want to test only encoder-decoder models - if not config.is_encoder_decoder: + for model_class in self.all_model_classes: + if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): continue - head_masking = { - "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)), - "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), - "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), - } + for problem_type in problem_types: + with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - signature = inspect.signature(model.call) - if set(head_masking.keys()) < set([*signature.parameters.keys()]): - continue + config.problem_type = problem_type["title"] + config.num_labels = problem_type["num_labels"] - for attn_name, (name, mask) in 
zip(attention_names, head_masking.items()): - out = model.generate( - inputs_dict["input_ids"], - num_beams=1, - max_length=inputs_dict["input_ids"] + 5, - output_attentions=True, - return_dict_in_generate=True, - **{name: mask}, - ) - # We check the state of decoder_attentions and cross_attentions just from the last step - attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] - self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0) + model = model_class(config) + model.to(torch_device) + model.train() + + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + + if problem_type["num_labels"] > 1: + inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"]) + + inputs["labels"] = inputs["labels"].to(problem_type["dtype"]) + + # This tests that we do not trigger the warning form PyTorch "Using a target size that is different + # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure + # they have the same size." which is a symptom something in wrong for the regression problem. 
+ # See https://github.com/huggingface/transformers/issues/11780 + with warnings.catch_warnings(record=True) as warning_list: + loss = model(**inputs).loss + for w in warning_list: + if "Using a target size that is different to the input size" in str(w.message): + raise ValueError( + f"Something is going wrong in the regression problem: intercepted {w.message}" + ) + + loss.backward() def test_load_with_mismatched_shapes(self): if not self.test_mismatched_shapes: @@ -1387,93 +2166,54 @@ def test_load_with_mismatched_shapes(self): ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): + if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): continue with self.subTest(msg=f"Testing {model_class}"): with tempfile.TemporaryDirectory() as tmp_dir: model = model_class(config) - inputs = self._prepare_for_class(inputs_dict, model_class) - _ = model(**inputs) model.save_pretrained(tmp_dir) # Fails when we don't set ignore_mismatched_sizes=True - with self.assertRaises(ValueError): - new_model = TFAutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42) - with self.assertRaises(ValueError): - new_model_without_prefix = TFAutoModel.from_pretrained(tmp_dir, vocab_size=10) + with self.assertRaises(RuntimeError): + new_model = AutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42) + with self.assertRaises(RuntimeError): + new_model_without_prefix = AutoModel.from_pretrained(tmp_dir, vocab_size=10) + + logger = logging.get_logger("transformers.modeling_utils") - logger = logging.get_logger("transformers.modeling_tf_utils") with CaptureLogger(logger) as cl: - new_model = TFAutoModelForSequenceClassification.from_pretrained( - tmp_dir, num_labels=42, ignore_mismatched_sizes=True + new_model = AutoModelForSequenceClassification.from_pretrained( + tmp_dir, + num_labels=42, + 
ignore_mismatched_sizes=True, ) self.assertIn("the shapes did not match", cl.out) - + new_model.to(torch_device) + inputs = self._prepare_for_class(inputs_dict, model_class) logits = new_model(**inputs).logits self.assertEqual(logits.shape[1], 42) with CaptureLogger(logger) as cl: - new_model_without_prefix = TFAutoModel.from_pretrained( + new_model_without_prefix = AutoModel.from_pretrained( tmp_dir, vocab_size=10, ignore_mismatched_sizes=True ) self.assertIn("the shapes did not match", cl.out) - - # Although Tf models always have a prefix pointing to `MainLayer`, - # we still add this "without prefix" test to keep a consistency between tf and pt tests. input_ids = ids_tensor((2, 8), 10) + new_model_without_prefix.to(torch_device) if self.is_encoder_decoder: new_model_without_prefix(input_ids, decoder_input_ids=input_ids) else: new_model_without_prefix(input_ids) - def test_model_main_input_name(self): - for model_class in self.all_model_classes: - model_signature = inspect.signature(getattr(model_class, "call")) - # The main input is the name of the argument after `self` - observed_main_input_name = list(model_signature.parameters.keys())[1] - self.assertEqual(model_class.main_input_name, observed_main_input_name) - def _generate_random_bad_tokens(self, num_bad_tokens, model): - # special tokens cannot be bad tokens - special_tokens = [] - if model.config.bos_token_id is not None: - special_tokens.append(model.config.bos_token_id) - if model.config.pad_token_id is not None: - special_tokens.append(model.config.pad_token_id) - if model.config.eos_token_id is not None: - special_tokens.append(model.config.eos_token_id) - - # create random bad tokens that are not special tokens - bad_tokens = [] - while len(bad_tokens) < num_bad_tokens: - token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0] - if token not in special_tokens: - bad_tokens.append(token) - return bad_tokens - - def _check_generated_ids(self, output_ids): - for token_id in 
output_ids[0].numpy().tolist(): - self.assertGreaterEqual(token_id, 0) - self.assertLess(token_id, self.model_tester.vocab_size) - - def _check_match_tokens(self, generated_ids, bad_words_ids): - # for all bad word tokens - for bad_word_ids in bad_words_ids: - # for all slices in batch - for generated_ids_slice in generated_ids: - # for all word idx - for i in range(len(bad_word_ids), len(generated_ids_slice)): - # if tokens match - if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids: - return True - return False - - -def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): - """Creates a random int32 tensor of the shape within the vocab size.""" +global_rng = random.Random() + + +def ids_tensor(shape, vocab_size, rng=None, name=None): + # Creates a random int32 tensor of the shape within the vocab size if rng is None: - rng = random.Random() + rng = global_rng total_dims = 1 for dim in shape: @@ -1483,28 +2223,20 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): for _ in range(total_dims): values.append(rng.randint(0, vocab_size - 1)) - output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32) + return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous() - return output - -def random_attention_mask(shape, rng=None, name=None, dtype=None): - attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype) +def random_attention_mask(shape, rng=None, name=None): + attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None) # make sure that at least one token is attended to for each batch - attn_mask = tf.concat( - [ - tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), - attn_mask[:, 1:], - ], - axis=1, - ) + attn_mask[:, -1] = 1 return attn_mask -def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): +def floats_tensor(shape, scale=1.0, rng=None, name=None): """Creates a random float32 tensor""" if rng is None: - rng 
= random.Random() + rng = global_rng total_dims = 1 for dim in shape: @@ -1514,134 +2246,128 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): for _ in range(total_dims): values.append(rng.random() * scale) - return tf.reshape( - tf.constant(values, dtype=dtype if dtype is not None else tf.float32), - shape=shape, - ) + return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous() -@require_tf -class UtilsFunctionsTest(unittest.TestCase): - - # tests whether the top_k_top_p_filtering function behaves as expected - def test_top_k_top_p_filtering(self): - logits = tf.convert_to_tensor( - [ - [ - 8.2220991, # 3rd highest value; idx. 0 - -0.5620044, - 5.23229752, - 4.0386393, - -6.8798378, - -0.54785802, - -3.2012153, - 2.92777176, - 1.88171953, - 7.35341276, # 5th highest value; idx. 9 - 8.43207833, # 2nd highest value; idx. 10 - -9.85711836, - -5.96209236, - -1.13039161, - -7.1115294, - -0.8369633, - -5.3186408, - 7.06427407, - 0.81369344, - -0.82023817, - -5.9179796, - 0.58813443, - -6.99778438, - 4.71551189, - -0.18771637, - 7.44020759, # 4th highest value; idx. 25 - 9.38450987, # 1st highest value; idx. 26 - 2.12662941, - -9.32562038, - 2.35652522, - ], # cummulative prob of 5 highest values <= 0.6 - [ - 0.58425518, - 4.53139238, - -5.57510464, - -6.28030699, - -7.19529503, - -4.02122551, - 1.39337037, - -6.06707057, - 1.59480517, - -9.643119, - 0.03907799, - 0.67231762, - -8.88206726, - 6.27115922, # 4th highest value; idx. 13 - 2.28520723, - 4.82767506, - 4.30421368, - 8.8275313, # 2nd highest value; idx. 17 - 5.44029958, # 5th highest value; idx. 18 - -4.4735794, - 7.38579536, # 3rd highest value; idx. 20 - -2.91051663, - 2.61946077, - -2.5674762, - -9.48959302, - -4.02922645, - -1.35416918, - 9.67702323, # 1st highest value; idx. 
27 - -5.89478553, - 1.85370467, - ], # cummulative prob of 5 highest values <= 0.6 - ], - dtype=tf.float32, - ) +@require_torch +class ModelUtilsTest(TestCasePlus): + @slow + def test_model_from_pretrained(self): + for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = BertConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, PretrainedConfig) - non_inf_expected_idx = tf.convert_to_tensor( - [ - [0, 0], - [0, 9], - [0, 10], - [0, 25], - [0, 26], - [1, 13], - [1, 17], - [1, 18], - [1, 20], - [1, 27], - ], - dtype=tf.int32, - ) # expected non filtered idx as noted above - - non_inf_expected_output = tf.convert_to_tensor( - [ - 8.222099, - 7.3534126, - 8.432078, - 7.4402075, - 9.38451, - 6.271159, - 8.827531, - 5.4402995, - 7.3857956, - 9.677023, - ], - dtype=tf.float32, - ) # expected non filtered values as noted above - - output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4) - - non_inf_output = output[output != -float("inf")] - non_inf_idx = tf.cast( - tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))), - dtype=tf.int32, - ) + model = BertModel.from_pretrained(model_name) + model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, PreTrainedModel) - tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12) - tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx) + self.assertEqual(len(loading_info["missing_keys"]), 0) + self.assertEqual(len(loading_info["unexpected_keys"]), 8) + self.assertEqual(len(loading_info["mismatched_keys"]), 0) + self.assertEqual(len(loading_info["error_msgs"]), 0) + config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) -@require_tf + # Not sure this is the intended behavior. 
TODO fix Lysandre & Thom + config.name_or_path = model_name + + model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) + self.assertEqual(model.config.output_hidden_states, True) + self.assertEqual(model.config, config) + + def test_model_from_pretrained_with_different_pretrained_model_name(self): + model = T5ForConditionalGeneration.from_pretrained(TINY_T5) + self.assertIsNotNone(model) + + logger = logging.get_logger("transformers.configuration_utils") + with CaptureLogger(logger) as cl: + BertModel.from_pretrained(TINY_T5) + self.assertTrue("You are using a model of type t5 to instantiate a model of type bert" in cl.out) + + @require_torch + def test_model_from_config_torch_dtype(self): + # test that the model can be instantiated with dtype of user's choice - as long as it's a + # float dtype. To make it happen config.torch_dtype needs to be set before instantiating the + # model from the config object. + + config = T5Config.from_pretrained(TINY_T5) + model = AutoModel.from_config(config) + # XXX: isn't supported + # model = T5ForConditionalGeneration.from_config(config) + self.assertEqual(model.dtype, torch.float32) + + model = AutoModel.from_config(config, torch_dtype=torch.float16) + self.assertEqual(model.dtype, torch.float16) + + # torch.set_default_dtype() supports only float dtypes, so will fail with non-float type + with self.assertRaises(ValueError): + model = AutoModel.from_config(config, torch_dtype=torch.int64) + + @require_torch + def test_model_from_pretrained_torch_dtype(self): + # test that the model can be instantiated with dtype of either + # 1. explicit from_pretrained's torch_dtype argument + # 2. via autodiscovery by looking at model weights (torch_dtype="auto") + # so if a model.half() was saved, we want it to be instantiated as such. 
+ # + # test an explicit model class, but also AutoModel separately as the latter goes through a different code path + model_path = self.get_auto_remove_tmp_dir() + + # baseline - we know TINY_T5 is fp32 model + model = T5ForConditionalGeneration.from_pretrained(TINY_T5) + self.assertEqual(model.dtype, torch.float32) + + # test the default fp32 save_pretrained => from_pretrained cycle + model.save_pretrained(model_path) + model = T5ForConditionalGeneration.from_pretrained(model_path) + self.assertEqual(model.dtype, torch.float32) + # test with auto-detection + model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto") + self.assertEqual(model.dtype, torch.float32) + + # test forced loading in fp16 (even though the weights are in fp32) + model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16) + self.assertEqual(model.dtype, torch.float16) + + # test fp16 save_pretrained, loaded with auto-detection + model = model.half() + model.save_pretrained(model_path) + model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto") + self.assertEqual(model.config.torch_dtype, torch.float16) + self.assertEqual(model.dtype, torch.float16) + + # tests `config.torch_dtype` saving + with open(f"{model_path}/config.json") as f: + config_dict = json.load(f) + self.assertEqual(config_dict["torch_dtype"], "float16") + + # test fp16 save_pretrained, loaded with the explicit fp16 + model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16) + self.assertEqual(model.dtype, torch.float16) + + # test AutoModel separately as it goes through a different path + # test auto-detection + model = AutoModel.from_pretrained(TINY_T5, torch_dtype="auto") + self.assertEqual(model.dtype, torch.float32) + # test forcing an explicit dtype + model = AutoModel.from_pretrained(TINY_T5, torch_dtype=torch.float16) + self.assertEqual(model.dtype, torch.float16) + + def 
test_no_super_init_config_and_model(self): + config = NoSuperInitConfig(attribute=32) + model = NoSuperInitModel(config) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + + model = NoSuperInitModel.from_pretrained(tmp_dir) + + +@require_torch @is_staging_test -class TFModelPushToHubTester(unittest.TestCase): +class ModelPushToHubTester(unittest.TestCase): @classmethod def setUpClass(cls): cls._token = login(username=USER, password=PASS) @@ -1649,19 +2375,29 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): try: - delete_repo(token=cls._token, name="test-model-tf") + delete_repo(token=cls._token, name="test-model") except HTTPError: pass try: delete_repo( token=cls._token, - name="test-model-tf-org", + name="test-model-org", organization="valid_org", ) except HTTPError: pass + try: + delete_repo(token=cls._token, name="test-dynamic-model") + except HTTPError: + pass + + try: + delete_repo(token=cls._token, name="test-dynamic-model-config") + except HTTPError: + pass + def test_push_to_hub(self): config = BertConfig( vocab_size=99, @@ -1670,35 +2406,17 @@ def test_push_to_hub(self): num_attention_heads=4, intermediate_size=37, ) - model = TFBertModel(config) - # Make sure model is properly initialized - _ = model(model.dummy_inputs) + model = BertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: model.save_pretrained( - os.path.join(tmp_dir, "test-model-tf"), + os.path.join(tmp_dir, "test-model"), push_to_hub=True, use_auth_token=self._token, ) - new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf") - models_equal = True - for p1, p2 in zip(model.weights, new_model.weights): - if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: - models_equal = False - self.assertTrue(models_equal) - - def test_push_to_hub_with_model_card(self): - config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - ) - model = TFBertModel(config) - with 
tempfile.TemporaryDirectory() as tmp_dir: - model.push_to_hub(os.path.join(tmp_dir, "test-model-tf")) - self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "test-model-card-tf", "README.md"))) + new_model = BertModel.from_pretrained(f"{USER}/test-model") + for p1, p2 in zip(model.parameters(), new_model.parameters()): + self.assertTrue(torch.equal(p1, p2)) def test_push_to_hub_in_organization(self): config = BertConfig( @@ -1708,18 +2426,50 @@ def test_push_to_hub_in_organization(self): num_attention_heads=4, intermediate_size=37, ) - model = TFBertModel(config) + model = BertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: model.save_pretrained( - os.path.join(tmp_dir, "test-model-tf-org"), + os.path.join(tmp_dir, "test-model-org"), push_to_hub=True, use_auth_token=self._token, organization="valid_org", ) - new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org") - models_equal = True - for p1, p2 in zip(model.weights, new_model.weights): - if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: - models_equal = False - self.assertTrue(models_equal) + new_model = BertModel.from_pretrained("valid_org/test-model-org") + for p1, p2 in zip(model.parameters(), new_model.parameters()): + self.assertTrue(torch.equal(p1, p2)) + + def test_push_to_hub_dynamic_model(self): + CustomConfig.register_for_auto_class() + CustomModel.register_for_auto_class() + + config = CustomConfig(hidden_size=32) + model = CustomModel(config) + + with tempfile.TemporaryDirectory() as tmp_dir: + repo = Repository( + tmp_dir, + clone_from=f"{USER}/test-dynamic-model", + use_auth_token=self._token, + ) + model.save_pretrained(tmp_dir) + # checks + self.assertDictEqual( + config.auto_map, + { + "AutoConfig": "custom_configuration.CustomConfig", + "AutoModel": "custom_modeling.CustomModel", + }, + ) + + repo.push_to_hub() + + new_model = AutoModel.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True) + # Can't make an isinstance check because the new_model is 
from the CustomModel class of a dynamic module + self.assertEqual(new_model.__class__.__name__, "CustomModel") + for p1, p2 in zip(model.parameters(), new_model.parameters()): + self.assertTrue(torch.equal(p1, p2)) + + config = AutoConfig.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True) + new_model = AutoModel.from_config(config, trust_remote_code=True) + self.assertEqual(new_model.__class__.__name__, "CustomModel") From 4dea175013e59ff1838185f06aa6e60c95a3fd66 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 24 Feb 2022 15:45:46 +0530 Subject: [PATCH 57/65] chore: revert to the original test_modeling_common.py --- tests/test_modeling_common.py | 452 +++++++--------------------------- 1 file changed, 86 insertions(+), 366 deletions(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 348ffcd2c4490..17888bcfac380 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -140,21 +140,13 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): if return_labels: if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs_dict["labels"] = torch.ones( - self.model_tester.batch_size, - dtype=torch.long, - device=torch_device, - ) + inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device) elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): inputs_dict["start_positions"] = torch.zeros( - self.model_tester.batch_size, - dtype=torch.long, - device=torch_device, + self.model_tester.batch_size, dtype=torch.long, device=torch_device ) inputs_dict["end_positions"] = torch.zeros( - self.model_tester.batch_size, - dtype=torch.long, - device=torch_device, + self.model_tester.batch_size, dtype=torch.long, device=torch_device ) elif model_class in [ *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), @@ -162,9 +154,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): 
*get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), ]: inputs_dict["labels"] = torch.zeros( - self.model_tester.batch_size, - dtype=torch.long, - device=torch_device, + self.model_tester.batch_size, dtype=torch.long, device=torch_device ) elif model_class in [ *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), @@ -174,27 +164,17 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), ]: inputs_dict["labels"] = torch.zeros( - ( - self.model_tester.batch_size, - self.model_tester.seq_length, - ), - dtype=torch.long, - device=torch_device, + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device ) elif model_class in get_values(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING): num_patches = self.model_tester.image_size // self.model_tester.patch_size inputs_dict["bool_masked_pos"] = torch.zeros( - (self.model_tester.batch_size, num_patches ** 2), - dtype=torch.long, - device=torch_device, + (self.model_tester.batch_size, num_patches**2), dtype=torch.long, device=torch_device ) return inputs_dict def test_save_load(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -220,10 +200,7 @@ def test_save_load(self): self.assertLessEqual(max_diff, 1e-5) def test_save_load_keys_to_ignore_on_save(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -233,11 +210,7 @@ def test_save_load_keys_to_ignore_on_save(self): # check the keys are in the original state_dict for k in _keys_to_ignore_on_save: - self.assertIn( - k, - model.state_dict().keys(), - 
"\n".join(model.state_dict().keys()), - ) + self.assertIn(k, model.state_dict().keys(), "\n".join(model.state_dict().keys())) # check that certain keys didn't get saved with the model with tempfile.TemporaryDirectory() as tmpdirname: @@ -245,11 +218,7 @@ def test_save_load_keys_to_ignore_on_save(self): output_model_file = os.path.join(tmpdirname, WEIGHTS_NAME) state_dict_saved = torch.load(output_model_file) for k in _keys_to_ignore_on_save: - self.assertNotIn( - k, - state_dict_saved.keys(), - "\n".join(state_dict_saved.keys()), - ) + self.assertNotIn(k, state_dict_saved.keys(), "\n".join(state_dict_saved.keys())) # Test we can load the state dict in the model, necessary for the checkpointing API in Trainer. load_result = model.load_state_dict(state_dict_saved, strict=False) @@ -260,10 +229,7 @@ def test_save_load_keys_to_ignore_on_save(self): self.assertTrue(len(load_result.unexpected_keys) == 0) def test_gradient_checkpointing_backward_compatibility(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: if not model_class.supports_gradient_checkpointing: @@ -274,10 +240,7 @@ def test_gradient_checkpointing_backward_compatibility(self): self.assertTrue(model.is_gradient_checkpointing) def test_gradient_checkpointing_enable_disable(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: if not model_class.supports_gradient_checkpointing: @@ -302,10 +265,7 @@ def _mock_init_weights(self, module): module.bias.data.fill_(3) def test_save_load_fast_init_from_base(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() 
base_class = MODEL_MAPPING[config.__class__] if isinstance(base_class, tuple): @@ -350,10 +310,7 @@ class CopyClass(model_class): self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") def test_save_load_fast_init_to_base(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() base_class = MODEL_MAPPING[config.__class__] if isinstance(base_class, tuple): @@ -399,10 +356,7 @@ class CopyClass(base_class): self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") def test_initialization(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: @@ -416,10 +370,7 @@ def test_initialization(self): ) def test_determinism(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -453,12 +404,7 @@ def test_forward_signature(self): "decoder_attention_mask", ] expected_arg_names.extend( - [ - "head_mask", - "decoder_head_mask", - "cross_attn_head_mask", - "encoder_outputs", - ] + ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names else ["encoder_outputs"] ) @@ -472,10 +418,7 @@ def test_training(self): return for model_class in self.all_model_classes: - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True if model_class in get_values(MODEL_MAPPING): @@ -493,10 +436,7 @@ def 
test_training_gradient_checkpointing(self): return for model_class in self.all_model_classes: - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.use_cache = False config.return_dict = True @@ -511,10 +451,7 @@ def test_training_gradient_checkpointing(self): loss.backward() def test_attention_outputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True seq_len = getattr(self.model_tester, "seq_length", None) @@ -552,21 +489,12 @@ def test_attention_outputs(self): if chunk_length is not None: self.assertListEqual( list(attentions[0].shape[-4:]), - [ - self.model_tester.num_attention_heads, - encoder_seq_length, - chunk_length, - encoder_key_length, - ], + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], ) else: self.assertListEqual( list(attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length, - ], + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], ) out_len = len(outputs) @@ -590,11 +518,7 @@ def test_attention_outputs(self): self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - decoder_key_length, - ], + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], ) # cross attentions @@ -633,46 +557,28 @@ def test_attention_outputs(self): if chunk_length is not None: self.assertListEqual( list(self_attentions[0].shape[-4:]), - [ - self.model_tester.num_attention_heads, - encoder_seq_length, - chunk_length, - encoder_key_length, - ], + [self.model_tester.num_attention_heads, 
encoder_seq_length, chunk_length, encoder_key_length], ) else: self.assertListEqual( list(self_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length, - ], + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], ) @slow def test_torchscript(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() self._create_and_check_torchscript(config, inputs_dict) @slow def test_torchscript_output_attentions(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.output_attentions = True self._create_and_check_torchscript(config, inputs_dict) @slow def test_torchscript_output_hidden_state(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.output_hidden_states = True self._create_and_check_torchscript(config, inputs_dict) @@ -696,13 +602,7 @@ def _create_and_check_torchscript(self, config, inputs_dict): decoder_input_ids = inputs["decoder_input_ids"] decoder_attention_mask = inputs["decoder_attention_mask"] traced_model = torch.jit.trace( - model, - ( - input_ids, - attention_mask, - decoder_input_ids, - decoder_attention_mask, - ), + model, (input_ids, attention_mask, decoder_input_ids, decoder_attention_mask) ) else: input_ids = inputs["input_ids"] @@ -741,10 +641,7 @@ def _create_and_check_torchscript(self, config, inputs_dict): key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers } - self.assertEqual( - set(model_state_dict.keys()), - set(loaded_model_state_dict.keys()), - ) + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) 
model_buffers = list(model.buffers()) for non_persistent_buffer in non_persistent_buffers.values(): @@ -767,17 +664,11 @@ def _create_and_check_torchscript(self, config, inputs_dict): self.assertTrue(models_equal) def test_torch_fx(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() self._create_and_check_torch_fx_tracing(config, inputs_dict) def test_torch_fx_output_loss(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() self._create_and_check_torch_fx_tracing(config, inputs_dict, output_loss=True) def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): @@ -797,12 +688,7 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa if model.config.is_encoder_decoder: model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward labels = inputs.get("labels", None) - input_names = [ - "input_ids", - "attention_mask", - "decoder_input_ids", - "decoder_attention_mask", - ] + input_names = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"] if labels is not None: input_names.append("labels") filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} @@ -812,11 +698,7 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa traced_model = symbolic_trace(model, input_names) traced_output = traced_model(**filtered_inputs) else: - input_names = [ - "input_ids", - "attention_mask", - "token_type_ids", - ] + input_names = ["input_ids", "attention_mask", "token_type_ids"] input_ids = inputs["input_ids"] labels = inputs.get("labels", None) @@ -872,10 +754,7 @@ def test_headmasking(self): return global_rng.seed(42) - ( - config, - inputs_dict, 
- ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() global_rng.seed() inputs_dict["output_attentions"] = True @@ -970,10 +849,7 @@ def test_head_pruning(self): self.assertEqual(attentions[0].shape[-3], 1) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual( - attentions[-1].shape[-3], - self.model_tester.num_attention_heads - 1, - ) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_save_load_from_pretrained(self): if not self.test_pruning: @@ -1009,10 +885,7 @@ def test_head_pruning_save_load_from_pretrained(self): attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], 1) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual( - attentions[-1].shape[-3], - self.model_tester.num_attention_heads - 1, - ) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_save_load_from_config_init(self): if not self.test_pruning: @@ -1046,10 +919,7 @@ def test_head_pruning_save_load_from_config_init(self): self.assertEqual(attentions[0].shape[-3], 1) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual( - attentions[-1].shape[-3], - self.model_tester.num_attention_heads - 1, - ) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_integration(self): if not self.test_pruning: @@ -1078,14 +948,8 @@ def test_head_pruning_integration(self): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) attentions = outputs[-1] - self.assertEqual( - attentions[0].shape[-3], - self.model_tester.num_attention_heads - 1, - ) - self.assertEqual( - attentions[1].shape[-3], - self.model_tester.num_attention_heads - 2, - ) + self.assertEqual(attentions[0].shape[-3], 
self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) @@ -1098,14 +962,8 @@ def test_head_pruning_integration(self): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) attentions = outputs[-1] - self.assertEqual( - attentions[0].shape[-3], - self.model_tester.num_attention_heads - 1, - ) - self.assertEqual( - attentions[1].shape[-3], - self.model_tester.num_attention_heads - 2, - ) + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) @@ -1116,18 +974,9 @@ def test_head_pruning_integration(self): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) attentions = outputs[-1] - self.assertEqual( - attentions[0].shape[-3], - self.model_tester.num_attention_heads - 1, - ) - self.assertEqual( - attentions[1].shape[-3], - self.model_tester.num_attention_heads - 2, - ) - self.assertEqual( - attentions[2].shape[-3], - self.model_tester.num_attention_heads - 2, - ) + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) @@ -1144,9 +993,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): hidden_states = outputs.encoder_hidden_states if 
config.is_encoder_decoder else outputs.hidden_states expected_num_layers = getattr( - self.model_tester, - "expected_num_hidden_layers", - self.model_tester.num_hidden_layers + 1, + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 ) self.assertEqual(len(hidden_states), expected_num_layers) @@ -1175,10 +1022,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): [decoder_seq_length, self.model_tester.hidden_size], ) - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True @@ -1191,10 +1035,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) def test_retain_grad_hidden_states_attentions(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.output_hidden_states = True config.output_attentions = True @@ -1288,10 +1129,7 @@ def test_resize_position_vector_embeddings(self): # Retrieve the embeddings and clone theme if model.config.is_encoder_decoder: - ( - encoder_model_embed, - decoder_model_embed, - ) = model.get_position_embeddings() + encoder_model_embed, decoder_model_embed = model.get_position_embeddings() encoder_cloned_embeddings = encoder_model_embed.weight.clone() decoder_cloned_embeddings = decoder_model_embed.weight.clone() else: @@ -1301,25 +1139,13 @@ def test_resize_position_vector_embeddings(self): # Check that resizing the position embeddings with a larger max_position_embeddings increases # the model's postion embeddings size model.resize_position_embeddings(max_position_embeddings + 10) - self.assertEqual( - model.config.max_position_embeddings, - max_position_embeddings + 10, - ) + 
self.assertEqual(model.config.max_position_embeddings, max_position_embeddings + 10) # Check that it actually resizes the embeddings matrix if model.config.is_encoder_decoder: - ( - encoder_model_embed, - decoder_model_embed, - ) = model.get_position_embeddings() - self.assertEqual( - encoder_model_embed.weight.shape[0], - encoder_cloned_embeddings.shape[0] + 10, - ) - self.assertEqual( - decoder_model_embed.weight.shape[0], - decoder_cloned_embeddings.shape[0] + 10, - ) + encoder_model_embed, decoder_model_embed = model.get_position_embeddings() + self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] + 10) + self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] + 10) else: model_embed = model.get_position_embeddings() self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) @@ -1330,25 +1156,13 @@ def test_resize_position_vector_embeddings(self): # Check that resizing the position embeddings with a smaller max_position_embeddings decreases # the model's max_position_embeddings model.resize_position_embeddings(max_position_embeddings - 5) - self.assertEqual( - model.config.max_position_embeddings, - max_position_embeddings - 5, - ) + self.assertEqual(model.config.max_position_embeddings, max_position_embeddings - 5) # Check that it actually resizes the embeddings matrix if model.config.is_encoder_decoder: - ( - encoder_model_embed, - decoder_model_embed, - ) = model.get_position_embeddings() - self.assertEqual( - encoder_model_embed.weight.shape[0], - encoder_cloned_embeddings.shape[0] - 5, - ) - self.assertEqual( - decoder_model_embed.weight.shape[0], - decoder_cloned_embeddings.shape[0] - 5, - ) + encoder_model_embed, decoder_model_embed = model.get_position_embeddings() + self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] - 5) + self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] - 5) else: 
model_embed = model.get_position_embeddings() self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5) @@ -1477,10 +1291,7 @@ def test_resize_embeddings_untied(self): model(**self._prepare_for_class(inputs_dict, model_class)) def test_model_common_attributes(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -1516,10 +1327,7 @@ def test_tie_model_weights(self): if not self.test_torchscript: return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_same_values(layer_1, layer_2): equal = True @@ -1564,10 +1372,7 @@ def check_same_values(layer_1, layer_2): # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) def test_model_outputs_equivalence(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def set_nan_tensor_to_zero(t): t[t != t] = 0 @@ -1592,9 +1397,7 @@ def recursive_check(tuple_object, dict_object): else: self.assertTrue( torch.allclose( - set_nan_tensor_to_zero(tuple_object), - set_nan_tensor_to_zero(dict_object), - atol=1e-5, + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 ), msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. 
Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.", ) @@ -1633,10 +1436,7 @@ def recursive_check(tuple_object, dict_object): tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence( - model, - tuple_inputs, - dict_inputs, - {"output_hidden_states": True, "output_attentions": True}, + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} ) @is_pt_tf_cross_test @@ -1646,10 +1446,7 @@ def test_pt_tf_model_equivalence(self): import transformers - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning @@ -1769,18 +1566,11 @@ def test_pt_tf_model_equivalence(self): def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): diff = np.abs((a - b)).max() - self.assertLessEqual( - diff, - tol, - f"Difference between torch and flax is {diff} (>= {tol}).", - ) + self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).") @is_pt_flax_cross_test def test_equivalence_pt_to_flax(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: with self.subTest(model_class.__name__): @@ -1818,11 +1608,7 @@ def test_equivalence_pt_to_flax(self): # convert inputs to Flax fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} fx_outputs = fx_model(**fx_inputs).to_tuple() - self.assertEqual( - len(fx_outputs), - len(pt_outputs), - "Output lengths differ between Flax and PyTorch", - ) + self.assertEqual(len(fx_outputs), 
len(pt_outputs), "Output lengths differ between Flax and PyTorch") for fx_output, pt_output in zip(fx_outputs, pt_outputs): self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) @@ -1832,19 +1618,14 @@ def test_equivalence_pt_to_flax(self): fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple() self.assertEqual( - len(fx_outputs_loaded), - len(pt_outputs), - "Output lengths differ between Flax and PyTorch", + len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" ) for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs): self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) @is_pt_flax_cross_test def test_equivalence_flax_to_pt(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: with self.subTest(model_class.__name__): @@ -1884,11 +1665,7 @@ def test_equivalence_flax_to_pt(self): fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} fx_outputs = fx_model(**fx_inputs).to_tuple() - self.assertEqual( - len(fx_outputs), - len(pt_outputs), - "Output lengths differ between Flax and PyTorch", - ) + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") for fx_output, pt_output in zip(fx_outputs, pt_outputs): self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) @@ -1901,18 +1678,13 @@ def test_equivalence_flax_to_pt(self): pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() self.assertEqual( - len(fx_outputs), - len(pt_outputs_loaded), - "Output lengths differ between Flax and PyTorch", + len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" ) for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded): self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) def test_inputs_embeds(self): - ( - 
config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -1942,18 +1714,11 @@ def test_inputs_embeds(self): @require_torch_multi_gpu def test_multi_gpu_data_parallel_forward(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # some params shouldn't be scattered by nn.DataParallel # so just remove them if they are present. - blacklist_non_batched_params = [ - "head_mask", - "decoder_head_mask", - "cross_attn_head_mask", - ] + blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"] for k in blacklist_non_batched_params: inputs_dict.pop(k, None) @@ -2039,10 +1804,7 @@ def test_model_parallel_equal_results(self): if not self.test_model_parallel: return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_parallelizable_model_classes: inputs_dict = self._prepare_for_class(inputs_dict, model_class) @@ -2080,10 +1842,7 @@ def test_model_parallel_beam_search(self): set(self.all_generative_model_classes).intersection(self.all_parallelizable_model_classes) ) - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in all_generative_and_parallelizable_model_classes: inputs_dict = self._prepare_for_class(inputs_dict, model_class) @@ -2103,22 +1862,11 @@ def cast_to_device(dictionary, device): model.generate(**cast_to_device(inputs_dict, "cuda:0"), num_beams=2) def test_problem_types(self): - ( - config, - inputs_dict, - ) = 
self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() problem_types = [ - { - "title": "multi_label_classification", - "num_labels": 2, - "dtype": torch.float, - }, - { - "title": "single_label_classification", - "num_labels": 1, - "dtype": torch.long, - }, + {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float}, + {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long}, {"title": "regression", "num_labels": 1, "dtype": torch.float}, ] @@ -2160,10 +1908,7 @@ def test_problem_types(self): def test_load_with_mismatched_shapes(self): if not self.test_mismatched_shapes: return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): @@ -2184,9 +1929,7 @@ def test_load_with_mismatched_shapes(self): with CaptureLogger(logger) as cl: new_model = AutoModelForSequenceClassification.from_pretrained( - tmp_dir, - num_labels=42, - ignore_mismatched_sizes=True, + tmp_dir, num_labels=42, ignore_mismatched_sizes=True ) self.assertIn("the shapes did not match", cl.out) new_model.to(torch_device) @@ -2380,11 +2123,7 @@ def tearDownClass(cls): pass try: - delete_repo( - token=cls._token, - name="test-model-org", - organization="valid_org", - ) + delete_repo(token=cls._token, name="test-model-org", organization="valid_org") except HTTPError: pass @@ -2400,19 +2139,11 @@ def tearDownClass(cls): def test_push_to_hub(self): config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 ) model = BertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: - 
model.save_pretrained( - os.path.join(tmp_dir, "test-model"), - push_to_hub=True, - use_auth_token=self._token, - ) + model.save_pretrained(os.path.join(tmp_dir, "test-model"), push_to_hub=True, use_auth_token=self._token) new_model = BertModel.from_pretrained(f"{USER}/test-model") for p1, p2 in zip(model.parameters(), new_model.parameters()): @@ -2420,11 +2151,7 @@ def test_push_to_hub(self): def test_push_to_hub_in_organization(self): config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 ) model = BertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: @@ -2447,19 +2174,12 @@ def test_push_to_hub_dynamic_model(self): model = CustomModel(config) with tempfile.TemporaryDirectory() as tmp_dir: - repo = Repository( - tmp_dir, - clone_from=f"{USER}/test-dynamic-model", - use_auth_token=self._token, - ) + repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-model", use_auth_token=self._token) model.save_pretrained(tmp_dir) # checks self.assertDictEqual( config.auto_map, - { - "AutoConfig": "custom_configuration.CustomConfig", - "AutoModel": "custom_modeling.CustomModel", - }, + {"AutoConfig": "custom_configuration.CustomConfig", "AutoModel": "custom_modeling.CustomModel"}, ) repo.push_to_hub() From 0f8069d656ce99b9c6d05ef5c058e82fdd6c71f2 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 25 Feb 2022 07:30:33 +0530 Subject: [PATCH 58/65] chore: revert to previous states for test_modeling_tf_common.py and modeling_tf_utils.py --- src/transformers/modeling_tf_utils.py | 125 ++------ tests/test_modeling_tf_common.py | 395 +++++--------------------- 2 files changed, 97 insertions(+), 423 deletions(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 4637130e7771c..8d2ad8d10c081 100644 --- a/src/transformers/modeling_tf_utils.py +++ 
b/src/transformers/modeling_tf_utils.py @@ -312,10 +312,9 @@ def booleans_processing(config, **kwargs): if tf.executing_eagerly(): # Pure conv models (such as ConvNext) do not have `output_attentions` - final_booleans["output_attentions"] = kwargs.get("output_attentions", None) - if final_booleans["output_attentions"] is None: - final_booleans["output_attentions"] = config.output_attentions - + final_booleans["output_attentions"] = ( + kwargs["output_attentions"] if kwargs["output_attentions"] is not None else config.output_attentions + ) final_booleans["output_hidden_states"] = ( kwargs["output_hidden_states"] if kwargs["output_hidden_states"] is not None @@ -367,17 +366,7 @@ def input_processing(func, config, input_ids, **kwargs): signature.pop("self", None) parameter_names = list(signature.keys()) output = {} - allowed_types = ( - tf.Tensor, - bool, - int, - ModelOutput, - tuple, - list, - dict, - np.ndarray, - KerasTensor, - ) + allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray, KerasTensor) if "inputs" in kwargs["kwargs_call"]: warnings.warn( @@ -490,13 +479,7 @@ def input_processing(func, config, input_ids, **kwargs): boolean_dict = { k: v for k, v in output.items() - if k - in [ - "return_dict", - "output_attentions", - "output_hidden_states", - "use_cache", - ] + if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"] } output.update( @@ -595,18 +578,11 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False, # If yes we reshape the weight from the H5 file accordingly to the current weight # If the two shapes are not compatible we raise an issue try: - array = np.reshape( - saved_weight_value, - K.int_shape(symbolic_weight), - ) + array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight)) except ValueError as e: if ignore_mismatched_sizes: mismatched_layers.append( - ( - symbolic_weight_name, - saved_weight_value.shape, - K.int_shape(symbolic_weight), - ) + 
(symbolic_weight_name, saved_weight_value.shape, K.int_shape(symbolic_weight)) ) continue else: @@ -650,17 +626,11 @@ def init_copy_embeddings(old_embeddings, new_num_tokens): # and we create a mask to properly identify the padded values and be replaced by the values of the newly created # embeddings current_weights = tf.pad( - old_embeddings.value(), - tf.convert_to_tensor([[0, size_diff], [0, 0]]), - constant_values=-1, + old_embeddings.value(), tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=-1 ) num_tokens_to_copy = min(old_num_tokens, new_num_tokens) mask = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True) - mask = tf.pad( - mask, - tf.convert_to_tensor([[0, size_diff], [0, 0]]), - constant_values=False, - ) + mask = tf.pad(mask, tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=False) else: # if the new size if lower than the old one, we take the current embeddings until the new size current_weights = tf.slice( @@ -805,10 +775,7 @@ def _save_checkpoint(self, checkpoint_dir, epoch): # internally and which users are likely to use too weights_path = os.path.join(checkpoint_dir, "weights.h5") self.save_weights(weights_path) - extra_data = { - "epoch": epoch, - "optimizer_state": self.optimizer.get_weights(), - } + extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()} extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle") with open(extra_data_path, "wb") as f: pickle.dump(extra_data, f) @@ -834,10 +801,7 @@ def load_repo_checkpoint(self, repo_path_or_name): if not os.path.isdir(repo_path_or_name): # If this isn't a local path, check that the remote repo exists and has a checkpoint in it repo_files = list_repo_files(repo_path_or_name) - for file in ( - "checkpoint/weights.h5", - "checkpoint/extra_data.pickle", - ): + for file in ("checkpoint/weights.h5", "checkpoint/extra_data.pickle"): if file not in repo_files: raise FileNotFoundError(f"Repo {repo_path_or_name} does not contain 
checkpoint file {file}!") if "/" not in repo_path_or_name: @@ -845,10 +809,7 @@ def load_repo_checkpoint(self, repo_path_or_name): repo_path_or_name = self.get_full_repo_name(repo_path_or_name) else: model_id = repo_path_or_name.split("/")[-1] - repo = Repository( - model_id, - clone_from=f"https://huggingface.co/{repo_path_or_name}", - ) + repo = Repository(model_id, clone_from=f"https://huggingface.co/{repo_path_or_name}") local_dir = repo.local_dir else: local_dir = repo_path_or_name @@ -1105,8 +1066,7 @@ def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]: `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model. """ warnings.warn( - "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", - FutureWarning, + "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning ) return self.get_lm_head() @@ -1117,10 +1077,7 @@ def get_prefix_bias_name(self) -> Union[None, str]: Return: `str`: The _prefix name of the bias. """ - warnings.warn( - "The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", - FutureWarning, - ) + warnings.warn("The method get_prefix_bias_name is deprecated. 
Please use `get_bias` instead.", FutureWarning) return None def get_bias(self) -> Union[None, Dict[str, tf.Variable]]: @@ -1268,25 +1225,15 @@ def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens): # initialize new bias if tf.math.greater(size_diff, 0): padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]] - current_bias = tf.pad( - weight.value(), - tf.convert_to_tensor(padding_shape), - constant_values=-1, - ) + current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1) num_tokens_to_copy = min(old_num_tokens, new_num_tokens) mask_shape = [num_tokens_to_copy] if first_dim is None else [1, num_tokens_to_copy] bias_mask = tf.fill(tf.convert_to_tensor(mask_shape), True) - bias_mask = tf.pad( - bias_mask, - tf.convert_to_tensor(padding_shape), - constant_values=False, - ) + bias_mask = tf.pad(bias_mask, tf.convert_to_tensor(padding_shape), constant_values=False) else: slice_from = [0] if first_dim is None else [0, 0] current_bias = tf.slice( - weight.value(), - tf.convert_to_tensor(slice_from), - tf.convert_to_tensor(final_shape), + weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape) ) bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True) @@ -1427,11 +1374,7 @@ def save_pretrained(self, save_directory, saved_model=False, version=1, push_to_ if saved_model: saved_model_dir = os.path.join(save_directory, "saved_model", str(version)) - self.save( - saved_model_dir, - include_optimizer=False, - signatures=self.serving, - ) + self.save(saved_model_dir, include_optimizer=False, signatures=self.serving) logger.info(f"Saved model created in {saved_model_dir}") # Save configuration file @@ -1583,11 +1526,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) - user_agent = { - "file_type": "model", - "framework": "tensorflow", - 
"from_auto_class": from_auto_class, - } + user_agent = {"file_type": "model", "framework": "tensorflow", "from_auto_class": from_auto_class} if from_pipeline is not None: user_agent["using_pipeline"] = from_pipeline @@ -1683,11 +1622,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "proxies": proxies, "use_auth_token": use_auth_token, } - if has_file( - pretrained_model_name_or_path, - WEIGHTS_NAME, - **has_file_kwargs, - ): + if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs): raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {TF2_WEIGHTS_NAME} " "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from " @@ -1837,9 +1772,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): # To update the docstring, we need to copy the method, otherwise we change the original docstring. TFPreTrainedModel.push_to_hub = copy_func(TFPreTrainedModel.push_to_hub) TFPreTrainedModel.push_to_hub.__doc__ = TFPreTrainedModel.push_to_hub.__doc__.format( - object="model", - object_class="TFAutoModel", - object_files="model checkpoint", + object="model", object_class="TFAutoModel", object_files="model checkpoint" ) @@ -1868,9 +1801,7 @@ def __init__(self, nf, nx, initializer_range=0.02, **kwargs): def build(self, input_shape): self.weight = self.add_weight( - "weight", - shape=[self.nx, self.nf], - initializer=get_initializer(self.initializer_range), + "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) ) self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) @@ -1916,9 +1847,7 @@ def build(self, input_shape): https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ self.weight = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - 
initializer=get_initializer(self.initializer_range), + "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) ) super().build(input_shape) @@ -2032,9 +1961,7 @@ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, ** else: num_classes = config.hidden_size self.summary = tf.keras.layers.Dense( - num_classes, - kernel_initializer=get_initializer(initializer_range), - name="summary", + num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" ) self.has_activation = False @@ -2129,9 +2056,7 @@ def register_for_auto_class(cls, auto_class="TFAutoModel"): cls._auto_class = auto_class -def get_initializer( - initializer_range: float = 0.02, -) -> tf.initializers.TruncatedNormal: +def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal: """ Creates a `tf.initializers.TruncatedNormal` with the given range. diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 2038f29e56cf8..142bff7cae06e 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -83,8 +83,7 @@ # Restrict TensorFlow to only allocate x GB of memory on the GPUs try: tf.config.set_logical_device_configuration( - gpu, - [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)], + gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)] ) logical_gpus = tf.config.list_logical_devices("GPU") print("Logical GPUs", logical_gpus) @@ -117,10 +116,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict = { - k: tf.tile( - tf.expand_dims(v, 1), - (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1), - ) + k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1)) if isinstance(v, tf.Tensor) and v.ndim > 0 else v for k, v in inputs_dict.items() @@ 
-148,11 +144,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING), ]: inputs_dict["labels"] = tf.zeros( - ( - self.model_tester.batch_size, - self.model_tester.seq_length, - ), - dtype=tf.int32, + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32 ) return inputs_dict @@ -160,10 +152,7 @@ def test_initialization(self): pass def test_save_load(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -177,10 +166,7 @@ def test_save_load(self): self.assert_outputs_same(after_outputs, outputs) def test_save_load_config(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -232,10 +218,7 @@ def test_onnx_compliancy(self): if not self.test_onnx: return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() INTERNAL_OPS = [ "Assert", "AssignVariableOp", @@ -282,10 +265,7 @@ def test_onnx_runtime_optimize(self): import onnxruntime import tf2onnx - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -296,10 +276,7 @@ def test_onnx_runtime_optimize(self): onnxruntime.InferenceSession(onnx_model_proto.SerializeToString()) def test_keras_save_load(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() tf_main_layer_classes = set( module_member @@ -344,8 +321,7 @@ def test_keras_save_load(self): ) else: model = tf.keras.models.load_model( - filepath, - custom_objects={main_layer_class.__name__: main_layer_class}, + filepath, custom_objects={main_layer_class.__name__: main_layer_class} ) assert isinstance(model, tf.keras.Model) after_outputs = model(inputs_dict) @@ -372,10 +348,7 @@ def test_pt_tf_model_equivalence(self): import transformers - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning @@ -388,9 +361,7 @@ def test_pt_tf_model_equivalence(self): # Check we can load pt model in tf and vice-versa with model => model functions tf_model = transformers.load_pytorch_model_in_tf2_model( - tf_model, - pt_model, - tf_inputs=self._prepare_for_class(inputs_dict, model_class), + tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class) ) pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) @@ -411,10 +382,7 @@ def test_pt_tf_model_equivalence(self): with torch.no_grad(): pto = pt_model(**pt_inputs_dict) - tfo = tf_model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - ) + tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False) tf_hidden_states = tfo[0].numpy() pt_hidden_states = pto[0].numpy() @@ -473,20 +441,14 @@ def test_pt_tf_model_equivalence(self): self.assertLessEqual(max_diff, 4e-2) def test_compile_tf_model(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() max_input = getattr(self.model_tester, "max_position_embeddings", 512) optimizer = 
tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") for model_class in self.all_model_classes: - if model_class.__name__ in [ - "TFSpeech2TextModel", - "TFSpeech2TextForConditionalGeneration", - ]: + if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]: inputs = { "decoder_input_ids": tf.keras.Input( batch_shape=(2, max_input), @@ -510,11 +472,7 @@ def test_compile_tf_model(self): name="decoder_input_ids", dtype="int32", ), - "input_ids": tf.keras.Input( - batch_shape=(2, max_input), - name="input_ids", - dtype="int32", - ), + "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"), } # `pixel_values` implies that the input is an image elif model_class.main_input_name == "pixel_values": @@ -530,11 +488,7 @@ def test_compile_tf_model(self): ) elif model_class.__name__ in ["TFCLIPModel"]: inputs = { - "input_ids": tf.keras.Input( - batch_shape=(3, max_input), - name="input_ids", - dtype="int32", - ), + "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"), "pixel_values": tf.keras.Input( batch_shape=( 3, @@ -547,11 +501,7 @@ def test_compile_tf_model(self): ), } elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs = tf.keras.Input( - batch_shape=(4, 2, max_input), - name="input_ids", - dtype="int32", - ) + inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32") else: inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") @@ -574,10 +524,7 @@ def test_compile_tf_model(self): extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) def test_keyword_and_dict_args(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -593,21 +540,10 @@ def test_keyword_and_dict_args(self): self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) def test_attention_outputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - decoder_seq_length = getattr( - self.model_tester, - "decoder_seq_length", - self.model_tester.seq_length, - ) - encoder_seq_length = getattr( - self.model_tester, - "encoder_seq_length", - self.model_tester.seq_length, - ) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) @@ -618,11 +554,7 @@ def check_decoder_attentions_output(outputs): self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - decoder_key_length, - ], + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], ) def check_encoder_attentions_output(outputs): @@ -632,11 +564,7 @@ def check_encoder_attentions_output(outputs): self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length, - ], + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], ) for model_class in self.all_model_classes: @@ -678,10 +606,7 @@ def test_headmasking(self): return 
random.Random().seed(42) - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() random.Random().seed() inputs_dict["output_attentions"] = True @@ -694,19 +619,11 @@ def test_headmasking(self): def prepare_layer_head_mask(i, attention_heads, num_hidden_layers): if i == 0: return tf.concat( - ( - tf.zeros(1, dtype=tf.float32), - tf.ones(attention_heads - 1, dtype=tf.float32), - ), - 0, + (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0 ) elif i == num_hidden_layers - 1: return tf.concat( - ( - tf.zeros(attention_heads - 1, dtype=tf.float32), - tf.ones(1, dtype=tf.float32), - ), - 0, + (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0 ) else: return tf.ones(attention_heads, dtype=tf.float32) @@ -735,8 +652,7 @@ def check_attentions_validity(attentions): # Remove Nan for t in attentions: self.assertLess( - (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), - (tf.size(t) / 4).numpy(), + (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy() ) # Check we don't have more than 25% nans (arbitrary) attentions = [ @@ -744,23 +660,11 @@ def check_attentions_validity(attentions): ] # remove them (the test is less complete) self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0) - self.assertNotEqual( - tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), - 0.0, - ) + self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0) if len(attentions) > 2: # encoder-decodere models have only 2 layers in each modules - self.assertNotEqual( - tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), - 0.0, - ) - self.assertAlmostEqual( - tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), - 0.0, - ) - self.assertNotEqual( - tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), - 
0.0, - ) + self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0) + self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0) + self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0) if model.config.is_encoder_decoder: check_attentions_validity(outputs.encoder_attentions) @@ -771,18 +675,13 @@ def check_attentions_validity(attentions): check_attentions_validity(outputs.attentions) def test_hidden_states_output(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_hidden_states_output(config, inputs_dict, model_class): model = model_class(config) outputs = model(self._prepare_for_class(inputs_dict, model_class)) expected_num_layers = getattr( - self.model_tester, - "expected_num_hidden_layers", - self.model_tester.num_hidden_layers + 1, + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 ) if model.config.is_encoder_decoder: @@ -793,18 +692,12 @@ def check_hidden_states_output(config, inputs_dict, model_class): self.assertEqual(len(encoder_hidden_states), expected_num_layers) self.assertListEqual( list(encoder_hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], + [self.model_tester.seq_length, self.model_tester.hidden_size], ) self.assertEqual(len(decoder_hidden_states), expected_num_layers) self.assertListEqual( list(decoder_hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], + [self.model_tester.seq_length, self.model_tester.hidden_size], ) else: hidden_states = outputs.hidden_states @@ -812,10 +705,7 @@ def check_hidden_states_output(config, inputs_dict, model_class): self.assertEqual(len(hidden_states), expected_num_layers) self.assertListEqual( list(hidden_states[0].shape[-2:]), - [ - 
self.model_tester.seq_length, - self.model_tester.hidden_size, - ], + [self.model_tester.seq_length, self.model_tester.hidden_size], ) for model_class in self.all_model_classes: @@ -827,10 +717,7 @@ def check_hidden_states_output(config, inputs_dict, model_class): check_hidden_states_output(config, inputs_dict, model_class) def test_model_common_attributes(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() text_in_text_out_models = ( get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) @@ -860,22 +747,13 @@ def test_model_common_attributes(self): assert name is None def test_determinism(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) first, second = ( - model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - )[0], - model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - )[0], + model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], + model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], ) out_1 = first.numpy() out_2 = second.numpy() @@ -886,10 +764,7 @@ def test_determinism(self): def test_model_outputs_equivalence(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) @@ -939,17 +814,11 @@ def recursive_check(tuple_object, dict_object): tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) dict_inputs = 
self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence( - model, - tuple_inputs, - dict_inputs, - {"output_hidden_states": True, "output_attentions": True}, + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} ) def test_inputs_embeds(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -976,10 +845,7 @@ def test_inputs_embeds(self): model(inputs) def test_numpy_arrays_inputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def prepare_numpy_arrays(inputs_dict): inputs_np_dict = {} @@ -1004,10 +870,7 @@ def prepare_numpy_arrays(inputs_dict): def test_resize_token_embeddings(self): if not self.test_resize_embeddings: return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def _get_word_embedding_weight(model, embedding_layer): embeds = getattr(embedding_layer, "weight", None) @@ -1066,25 +929,16 @@ def _get_word_embedding_weight(model, embedding_layer): if old_output_embeddings is not None and new_output_embeddings is not None: self.assertEqual(new_output_embeddings.shape[0], assert_size) - self.assertEqual( - new_output_embeddings.shape[1], - old_output_embeddings.shape[1], - ) + self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1]) models_equal = True - for p1, p2 in zip( - old_output_embeddings.value(), - new_output_embeddings.value(), - ): + for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: models_equal = False 
self.assertTrue(models_equal) def test_lm_head_model_random_no_beam_search_generate(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) # iterate over all generative models @@ -1111,25 +965,16 @@ def test_lm_head_model_random_no_beam_search_generate(self): # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [ - self._generate_random_bad_tokens(1, model), - self._generate_random_bad_tokens(2, model), - ] + bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] output_tokens = model.generate( - input_ids, - do_sample=True, - bad_words_ids=bad_words_ids, - num_return_sequences=2, + input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 ) # only count generated tokens generated_ids = output_tokens[:, input_ids.shape[-1] :] self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) def test_lm_head_model_no_beam_search_generate_dict_outputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) if input_ids is None: input_ids = inputs_dict.get("input_features", None) @@ -1162,10 +1007,7 @@ def test_lm_head_model_no_beam_search_generate_dict_outputs(self): self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput) def test_lm_head_model_random_beam_search_generate(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) for model_class in self.all_generative_model_classes: @@ 
-1180,12 +1022,7 @@ def test_lm_head_model_random_beam_search_generate(self): with self.assertRaises(AssertionError): # generating more sequences than having beams leads is not possible - model.generate( - input_ids, - do_sample=False, - num_return_sequences=3, - num_beams=2, - ) + model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2) # num_return_sequences > 1, sample self._check_generated_ids( @@ -1197,37 +1034,20 @@ def test_lm_head_model_random_beam_search_generate(self): ) ) # num_return_sequences > 1, greedy - self._check_generated_ids( - model.generate( - input_ids, - do_sample=False, - num_beams=2, - num_return_sequences=2, - ) - ) + self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2)) # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [ - self._generate_random_bad_tokens(1, model), - self._generate_random_bad_tokens(2, model), - ] + bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] output_tokens = model.generate( - input_ids, - do_sample=False, - bad_words_ids=bad_words_ids, - num_beams=2, - num_return_sequences=2, + input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 ) # only count generated tokens generated_ids = output_tokens[:, input_ids.shape[-1] :] self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) def test_lm_head_model_beam_search_generate_dict_outputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) if input_ids is None: input_ids = inputs_dict.get("input_features", None) @@ -1262,20 +1082,14 @@ def test_lm_head_model_beam_search_generate_dict_outputs(self): 
self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput) def test_loss_computation(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) if getattr(model, "hf_compute_loss", None): # The number of elements in the loss should be the same as the number of elements in the label prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) added_label = prepared_for_class[ - sorted( - list(prepared_for_class.keys() - inputs_dict.keys()), - reverse=True, - )[0] + sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] ] loss_size = tf.size(added_label) @@ -1286,11 +1100,7 @@ def test_loss_computation(self): # Test that model correctly compute the loss with kwargs prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - possible_input_names = { - "input_ids", - "pixel_values", - "input_features", - } + possible_input_names = {"input_ids", "pixel_values", "input_features"} input_name = possible_input_names.intersection(set(prepared_for_class)).pop() model_input = prepared_for_class.pop(input_name) @@ -1334,15 +1144,8 @@ def test_loss_computation(self): self.assertEqual(loss.shape, [loss_size]) def test_generate_with_headmasking(self): - attention_names = [ - "encoder_attentions", - "decoder_attentions", - "cross_attentions", - ] - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_generative_model_classes: model = model_class(config) @@ -1377,10 +1180,7 @@ def test_generate_with_headmasking(self): def 
test_load_with_mismatched_shapes(self): if not self.test_mismatched_shapes: return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): @@ -1487,13 +1287,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): def random_attention_mask(shape, rng=None, name=None, dtype=None): attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype) # make sure that at least one token is attended to for each batch - attn_mask = tf.concat( - [ - tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), - attn_mask[:, 1:], - ], - axis=1, - ) + attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1) return attn_mask @@ -1510,10 +1304,7 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): for _ in range(total_dims): values.append(rng.random() * scale) - return tf.reshape( - tf.constant(values, dtype=dtype if dtype is not None else tf.float32), - shape=shape, - ) + return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape) @require_tf @@ -1592,34 +1383,12 @@ def test_top_k_top_p_filtering(self): ) non_inf_expected_idx = tf.convert_to_tensor( - [ - [0, 0], - [0, 9], - [0, 10], - [0, 25], - [0, 26], - [1, 13], - [1, 17], - [1, 18], - [1, 20], - [1, 27], - ], + [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]], dtype=tf.int32, ) # expected non filtered idx as noted above non_inf_expected_output = tf.convert_to_tensor( - [ - 8.222099, - 7.3534126, - 8.432078, - 7.4402075, - 9.38451, - 6.271159, - 8.827531, - 5.4402995, - 7.3857956, - 9.677023, - ], + [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023], 
dtype=tf.float32, ) # expected non filtered values as noted above @@ -1650,31 +1419,19 @@ def tearDownClass(cls): pass try: - delete_repo( - token=cls._token, - name="test-model-tf-org", - organization="valid_org", - ) + delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org") except HTTPError: pass def test_push_to_hub(self): config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 ) model = TFBertModel(config) # Make sure model is properly initialized _ = model(model.dummy_inputs) with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained( - os.path.join(tmp_dir, "test-model-tf"), - push_to_hub=True, - use_auth_token=self._token, - ) + model.save_pretrained(os.path.join(tmp_dir, "test-model-tf"), push_to_hub=True, use_auth_token=self._token) new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf") models_equal = True @@ -1685,11 +1442,7 @@ def test_push_to_hub(self): def test_push_to_hub_with_model_card(self): config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 ) model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: @@ -1698,11 +1451,7 @@ def test_push_to_hub_with_model_card(self): def test_push_to_hub_in_organization(self): config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 ) model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: From f4292b45353051910aeb75c6767e9f2335f2706c Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 25 Feb 2022 07:49:33 +0530 Subject: [PATCH 
59/65] fix: tests for convnext. --- tests/test_modeling_tf_common.py | 395 +++++++++++++++++++++++++------ 1 file changed, 323 insertions(+), 72 deletions(-) diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 142bff7cae06e..2038f29e56cf8 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -83,7 +83,8 @@ # Restrict TensorFlow to only allocate x GB of memory on the GPUs try: tf.config.set_logical_device_configuration( - gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)] + gpu, + [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)], ) logical_gpus = tf.config.list_logical_devices("GPU") print("Logical GPUs", logical_gpus) @@ -116,7 +117,10 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict = { - k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1)) + k: tf.tile( + tf.expand_dims(v, 1), + (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1), + ) if isinstance(v, tf.Tensor) and v.ndim > 0 else v for k, v in inputs_dict.items() @@ -144,7 +148,11 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING), ]: inputs_dict["labels"] = tf.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32 + ( + self.model_tester.batch_size, + self.model_tester.seq_length, + ), + dtype=tf.int32, ) return inputs_dict @@ -152,7 +160,10 @@ def test_initialization(self): pass def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -166,7 +177,10 @@ def test_save_load(self): 
self.assert_outputs_same(after_outputs, outputs) def test_save_load_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -218,7 +232,10 @@ def test_onnx_compliancy(self): if not self.test_onnx: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() INTERNAL_OPS = [ "Assert", "AssignVariableOp", @@ -265,7 +282,10 @@ def test_onnx_runtime_optimize(self): import onnxruntime import tf2onnx - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -276,7 +296,10 @@ def test_onnx_runtime_optimize(self): onnxruntime.InferenceSession(onnx_model_proto.SerializeToString()) def test_keras_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() tf_main_layer_classes = set( module_member @@ -321,7 +344,8 @@ def test_keras_save_load(self): ) else: model = tf.keras.models.load_model( - filepath, custom_objects={main_layer_class.__name__: main_layer_class} + filepath, + custom_objects={main_layer_class.__name__: main_layer_class}, ) assert isinstance(model, tf.keras.Model) after_outputs = model(inputs_dict) @@ -348,7 +372,10 @@ def test_pt_tf_model_equivalence(self): import transformers - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: pt_model_class_name = 
model_class.__name__[2:] # Skip the "TF" at the beginning @@ -361,7 +388,9 @@ def test_pt_tf_model_equivalence(self): # Check we can load pt model in tf and vice-versa with model => model functions tf_model = transformers.load_pytorch_model_in_tf2_model( - tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class) + tf_model, + pt_model, + tf_inputs=self._prepare_for_class(inputs_dict, model_class), ) pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) @@ -382,7 +411,10 @@ def test_pt_tf_model_equivalence(self): with torch.no_grad(): pto = pt_model(**pt_inputs_dict) - tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False) + tfo = tf_model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + ) tf_hidden_states = tfo[0].numpy() pt_hidden_states = pto[0].numpy() @@ -441,14 +473,20 @@ def test_pt_tf_model_equivalence(self): self.assertLessEqual(max_diff, 4e-2) def test_compile_tf_model(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() max_input = getattr(self.model_tester, "max_position_embeddings", 512) optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") for model_class in self.all_model_classes: - if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]: + if model_class.__name__ in [ + "TFSpeech2TextModel", + "TFSpeech2TextForConditionalGeneration", + ]: inputs = { "decoder_input_ids": tf.keras.Input( batch_shape=(2, max_input), @@ -472,7 +510,11 @@ def test_compile_tf_model(self): name="decoder_input_ids", dtype="int32", ), - "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"), + "input_ids": tf.keras.Input( + 
batch_shape=(2, max_input), + name="input_ids", + dtype="int32", + ), } # `pixel_values` implies that the input is an image elif model_class.main_input_name == "pixel_values": @@ -488,7 +530,11 @@ def test_compile_tf_model(self): ) elif model_class.__name__ in ["TFCLIPModel"]: inputs = { - "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"), + "input_ids": tf.keras.Input( + batch_shape=(3, max_input), + name="input_ids", + dtype="int32", + ), "pixel_values": tf.keras.Input( batch_shape=( 3, @@ -501,7 +547,11 @@ def test_compile_tf_model(self): ), } elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32") + inputs = tf.keras.Input( + batch_shape=(4, 2, max_input), + name="input_ids", + dtype="int32", + ) else: inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") @@ -524,7 +574,10 @@ def test_compile_tf_model(self): extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) def test_keyword_and_dict_args(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -540,10 +593,21 @@ def test_keyword_and_dict_args(self): self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + decoder_seq_length = getattr( + self.model_tester, + "decoder_seq_length", + 
self.model_tester.seq_length, + ) + encoder_seq_length = getattr( + self.model_tester, + "encoder_seq_length", + self.model_tester.seq_length, + ) decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) @@ -554,7 +618,11 @@ def check_decoder_attentions_output(outputs): self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + decoder_key_length, + ], ) def check_encoder_attentions_output(outputs): @@ -564,7 +632,11 @@ def check_encoder_attentions_output(outputs): self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + [ + self.model_tester.num_attention_heads, + encoder_seq_length, + encoder_key_length, + ], ) for model_class in self.all_model_classes: @@ -606,7 +678,10 @@ def test_headmasking(self): return random.Random().seed(42) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() random.Random().seed() inputs_dict["output_attentions"] = True @@ -619,11 +694,19 @@ def test_headmasking(self): def prepare_layer_head_mask(i, attention_heads, num_hidden_layers): if i == 0: return tf.concat( - (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0 + ( + tf.zeros(1, dtype=tf.float32), + tf.ones(attention_heads - 1, dtype=tf.float32), + ), + 0, ) elif i == num_hidden_layers - 1: return tf.concat( - (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0 + ( + tf.zeros(attention_heads - 1, dtype=tf.float32), + 
tf.ones(1, dtype=tf.float32), + ), + 0, ) else: return tf.ones(attention_heads, dtype=tf.float32) @@ -652,7 +735,8 @@ def check_attentions_validity(attentions): # Remove Nan for t in attentions: self.assertLess( - (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy() + (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), + (tf.size(t) / 4).numpy(), ) # Check we don't have more than 25% nans (arbitrary) attentions = [ @@ -660,11 +744,23 @@ def check_attentions_validity(attentions): ] # remove them (the test is less complete) self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0) - self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0) + self.assertNotEqual( + tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), + 0.0, + ) if len(attentions) > 2: # encoder-decodere models have only 2 layers in each modules - self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0) - self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0) - self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0) + self.assertNotEqual( + tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), + 0.0, + ) + self.assertAlmostEqual( + tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), + 0.0, + ) + self.assertNotEqual( + tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), + 0.0, + ) if model.config.is_encoder_decoder: check_attentions_validity(outputs.encoder_attentions) @@ -675,13 +771,18 @@ def check_attentions_validity(attentions): check_attentions_validity(outputs.attentions) def test_hidden_states_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() def check_hidden_states_output(config, inputs_dict, model_class): model = model_class(config) outputs = 
model(self._prepare_for_class(inputs_dict, model_class)) expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + self.model_tester, + "expected_num_hidden_layers", + self.model_tester.num_hidden_layers + 1, ) if model.config.is_encoder_decoder: @@ -692,12 +793,18 @@ def check_hidden_states_output(config, inputs_dict, model_class): self.assertEqual(len(encoder_hidden_states), expected_num_layers) self.assertListEqual( list(encoder_hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size], + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], ) self.assertEqual(len(decoder_hidden_states), expected_num_layers) self.assertListEqual( list(decoder_hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size], + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], ) else: hidden_states = outputs.hidden_states @@ -705,7 +812,10 @@ def check_hidden_states_output(config, inputs_dict, model_class): self.assertEqual(len(hidden_states), expected_num_layers) self.assertListEqual( list(hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size], + [ + self.model_tester.seq_length, + self.model_tester.hidden_size, + ], ) for model_class in self.all_model_classes: @@ -717,7 +827,10 @@ def check_hidden_states_output(config, inputs_dict, model_class): check_hidden_states_output(config, inputs_dict, model_class) def test_model_common_attributes(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() text_in_text_out_models = ( get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) @@ -747,13 +860,22 @@ def test_model_common_attributes(self): assert name is None def test_determinism(self): - config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) first, second = ( - model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], - model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], + model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + )[0], + model( + self._prepare_for_class(inputs_dict, model_class), + training=False, + )[0], ) out_1 = first.numpy() out_2 = second.numpy() @@ -764,7 +886,10 @@ def test_determinism(self): def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) @@ -814,11 +939,17 @@ def recursive_check(tuple_object, dict_object): tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + model, + tuple_inputs, + dict_inputs, + {"output_hidden_states": True, "output_attentions": True}, ) def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -845,7 +976,10 @@ def test_inputs_embeds(self): model(inputs) def test_numpy_arrays_inputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = 
self.model_tester.prepare_config_and_inputs_for_common() def prepare_numpy_arrays(inputs_dict): inputs_np_dict = {} @@ -870,7 +1004,10 @@ def prepare_numpy_arrays(inputs_dict): def test_resize_token_embeddings(self): if not self.test_resize_embeddings: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() def _get_word_embedding_weight(model, embedding_layer): embeds = getattr(embedding_layer, "weight", None) @@ -929,16 +1066,25 @@ def _get_word_embedding_weight(model, embedding_layer): if old_output_embeddings is not None and new_output_embeddings is not None: self.assertEqual(new_output_embeddings.shape[0], assert_size) - self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1]) + self.assertEqual( + new_output_embeddings.shape[1], + old_output_embeddings.shape[1], + ) models_equal = True - for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): + for p1, p2 in zip( + old_output_embeddings.value(), + new_output_embeddings.value(), + ): if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: models_equal = False self.assertTrue(models_equal) def test_lm_head_model_random_no_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) # iterate over all generative models @@ -965,16 +1111,25 @@ def test_lm_head_model_random_no_beam_search_generate(self): # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + bad_words_ids = [ + self._generate_random_bad_tokens(1, model), + self._generate_random_bad_tokens(2, model), + ] output_tokens = model.generate( - 
input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 + input_ids, + do_sample=True, + bad_words_ids=bad_words_ids, + num_return_sequences=2, ) # only count generated tokens generated_ids = output_tokens[:, input_ids.shape[-1] :] self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) def test_lm_head_model_no_beam_search_generate_dict_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) if input_ids is None: input_ids = inputs_dict.get("input_features", None) @@ -1007,7 +1162,10 @@ def test_lm_head_model_no_beam_search_generate_dict_outputs(self): self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput) def test_lm_head_model_random_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) for model_class in self.all_generative_model_classes: @@ -1022,7 +1180,12 @@ def test_lm_head_model_random_beam_search_generate(self): with self.assertRaises(AssertionError): # generating more sequences than having beams leads is not possible - model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2) + model.generate( + input_ids, + do_sample=False, + num_return_sequences=3, + num_beams=2, + ) # num_return_sequences > 1, sample self._check_generated_ids( @@ -1034,20 +1197,37 @@ def test_lm_head_model_random_beam_search_generate(self): ) ) # num_return_sequences > 1, greedy - self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2)) + self._check_generated_ids( + model.generate( + input_ids, + do_sample=False, + num_beams=2, + num_return_sequences=2, + ) + ) # check bad words 
tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + bad_words_ids = [ + self._generate_random_bad_tokens(1, model), + self._generate_random_bad_tokens(2, model), + ] output_tokens = model.generate( - input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 + input_ids, + do_sample=False, + bad_words_ids=bad_words_ids, + num_beams=2, + num_return_sequences=2, ) # only count generated tokens generated_ids = output_tokens[:, input_ids.shape[-1] :] self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) def test_lm_head_model_beam_search_generate_dict_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) if input_ids is None: input_ids = inputs_dict.get("input_features", None) @@ -1082,14 +1262,20 @@ def test_lm_head_model_beam_search_generate_dict_outputs(self): self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput) def test_loss_computation(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) if getattr(model, "hf_compute_loss", None): # The number of elements in the loss should be the same as the number of elements in the label prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) added_label = prepared_for_class[ - sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] + sorted( + list(prepared_for_class.keys() - inputs_dict.keys()), + reverse=True, + )[0] ] loss_size = tf.size(added_label) @@ 
-1100,7 +1286,11 @@ def test_loss_computation(self): # Test that model correctly compute the loss with kwargs prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - possible_input_names = {"input_ids", "pixel_values", "input_features"} + possible_input_names = { + "input_ids", + "pixel_values", + "input_features", + } input_name = possible_input_names.intersection(set(prepared_for_class)).pop() model_input = prepared_for_class.pop(input_name) @@ -1144,8 +1334,15 @@ def test_loss_computation(self): self.assertEqual(loss.shape, [loss_size]) def test_generate_with_headmasking(self): - attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + attention_names = [ + "encoder_attentions", + "decoder_attentions", + "cross_attentions", + ] + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_generative_model_classes: model = model_class(config) @@ -1180,7 +1377,10 @@ def test_generate_with_headmasking(self): def test_load_with_mismatched_shapes(self): if not self.test_mismatched_shapes: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): @@ -1287,7 +1487,13 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): def random_attention_mask(shape, rng=None, name=None, dtype=None): attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype) # make sure that at least one token is attended to for each batch - attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1) + attn_mask = tf.concat( + [ + tf.constant(value=1, shape=(shape[0], 1), 
dtype=dtype), + attn_mask[:, 1:], + ], + axis=1, + ) return attn_mask @@ -1304,7 +1510,10 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): for _ in range(total_dims): values.append(rng.random() * scale) - return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape) + return tf.reshape( + tf.constant(values, dtype=dtype if dtype is not None else tf.float32), + shape=shape, + ) @require_tf @@ -1383,12 +1592,34 @@ def test_top_k_top_p_filtering(self): ) non_inf_expected_idx = tf.convert_to_tensor( - [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]], + [ + [0, 0], + [0, 9], + [0, 10], + [0, 25], + [0, 26], + [1, 13], + [1, 17], + [1, 18], + [1, 20], + [1, 27], + ], dtype=tf.int32, ) # expected non filtered idx as noted above non_inf_expected_output = tf.convert_to_tensor( - [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023], + [ + 8.222099, + 7.3534126, + 8.432078, + 7.4402075, + 9.38451, + 6.271159, + 8.827531, + 5.4402995, + 7.3857956, + 9.677023, + ], dtype=tf.float32, ) # expected non filtered values as noted above @@ -1419,19 +1650,31 @@ def tearDownClass(cls): pass try: - delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org") + delete_repo( + token=cls._token, + name="test-model-tf-org", + organization="valid_org", + ) except HTTPError: pass def test_push_to_hub(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) model = TFBertModel(config) # Make sure model is properly initialized _ = model(model.dummy_inputs) with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(os.path.join(tmp_dir, "test-model-tf"), push_to_hub=True, use_auth_token=self._token) + model.save_pretrained( + 
os.path.join(tmp_dir, "test-model-tf"), + push_to_hub=True, + use_auth_token=self._token, + ) new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf") models_equal = True @@ -1442,7 +1685,11 @@ def test_push_to_hub(self): def test_push_to_hub_with_model_card(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: @@ -1451,7 +1698,11 @@ def test_push_to_hub_with_model_card(self): def test_push_to_hub_in_organization(self): config = BertConfig( - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, ) model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: From 8b99c8e63f05469e1e33d856ad185c7613c111ad Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 25 Feb 2022 13:26:32 +0530 Subject: [PATCH 60/65] chore: removed output_attentions argument from convnext config. 
--- src/transformers/modeling_tf_utils.py | 127 ++++++++++++++---- .../models/convnext/configuration_convnext.py | 1 - 2 files changed, 101 insertions(+), 27 deletions(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 8d2ad8d10c081..9d392ec6e4ff0 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -312,9 +312,10 @@ def booleans_processing(config, **kwargs): if tf.executing_eagerly(): # Pure conv models (such as ConvNext) do not have `output_attentions` - final_booleans["output_attentions"] = ( - kwargs["output_attentions"] if kwargs["output_attentions"] is not None else config.output_attentions - ) + final_booleans["output_attentions"] = kwargs.get("output_attentions", None) + if final_booleans["output_attentions"] is None: + final_booleans["output_attentions"] = config.output_attentions + final_booleans["output_hidden_states"] = ( kwargs["output_hidden_states"] if kwargs["output_hidden_states"] is not None @@ -366,7 +367,17 @@ def input_processing(func, config, input_ids, **kwargs): signature.pop("self", None) parameter_names = list(signature.keys()) output = {} - allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray, KerasTensor) + allowed_types = ( + tf.Tensor, + bool, + int, + ModelOutput, + tuple, + list, + dict, + np.ndarray, + KerasTensor, + ) if "inputs" in kwargs["kwargs_call"]: warnings.warn( @@ -479,7 +490,13 @@ def input_processing(func, config, input_ids, **kwargs): boolean_dict = { k: v for k, v in output.items() - if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"] + if k + in [ + "return_dict", + "output_attentions", + "output_hidden_states", + "use_cache", + ] } output.update( @@ -578,11 +595,18 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False, # If yes we reshape the weight from the H5 file accordingly to the current weight # If the two shapes are not compatible 
we raise an issue try: - array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight)) + array = np.reshape( + saved_weight_value, + K.int_shape(symbolic_weight), + ) except ValueError as e: if ignore_mismatched_sizes: mismatched_layers.append( - (symbolic_weight_name, saved_weight_value.shape, K.int_shape(symbolic_weight)) + ( + symbolic_weight_name, + saved_weight_value.shape, + K.int_shape(symbolic_weight), + ) ) continue else: @@ -626,11 +650,17 @@ def init_copy_embeddings(old_embeddings, new_num_tokens): # and we create a mask to properly identify the padded values and be replaced by the values of the newly created # embeddings current_weights = tf.pad( - old_embeddings.value(), tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=-1 + old_embeddings.value(), + tf.convert_to_tensor([[0, size_diff], [0, 0]]), + constant_values=-1, ) num_tokens_to_copy = min(old_num_tokens, new_num_tokens) mask = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True) - mask = tf.pad(mask, tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=False) + mask = tf.pad( + mask, + tf.convert_to_tensor([[0, size_diff], [0, 0]]), + constant_values=False, + ) else: # if the new size if lower than the old one, we take the current embeddings until the new size current_weights = tf.slice( @@ -775,7 +805,10 @@ def _save_checkpoint(self, checkpoint_dir, epoch): # internally and which users are likely to use too weights_path = os.path.join(checkpoint_dir, "weights.h5") self.save_weights(weights_path) - extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()} + extra_data = { + "epoch": epoch, + "optimizer_state": self.optimizer.get_weights(), + } extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle") with open(extra_data_path, "wb") as f: pickle.dump(extra_data, f) @@ -801,7 +834,10 @@ def load_repo_checkpoint(self, repo_path_or_name): if not os.path.isdir(repo_path_or_name): # If this isn't a local path, check that the 
remote repo exists and has a checkpoint in it repo_files = list_repo_files(repo_path_or_name) - for file in ("checkpoint/weights.h5", "checkpoint/extra_data.pickle"): + for file in ( + "checkpoint/weights.h5", + "checkpoint/extra_data.pickle", + ): if file not in repo_files: raise FileNotFoundError(f"Repo {repo_path_or_name} does not contain checkpoint file {file}!") if "/" not in repo_path_or_name: @@ -809,7 +845,10 @@ def load_repo_checkpoint(self, repo_path_or_name): repo_path_or_name = self.get_full_repo_name(repo_path_or_name) else: model_id = repo_path_or_name.split("/")[-1] - repo = Repository(model_id, clone_from=f"https://huggingface.co/{repo_path_or_name}") + repo = Repository( + model_id, + clone_from=f"https://huggingface.co/{repo_path_or_name}", + ) local_dir = repo.local_dir else: local_dir = repo_path_or_name @@ -1066,7 +1105,8 @@ def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]: `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model. """ warnings.warn( - "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning + "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", + FutureWarning, ) return self.get_lm_head() @@ -1077,7 +1117,10 @@ def get_prefix_bias_name(self) -> Union[None, str]: Return: `str`: The _prefix name of the bias. """ - warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + warnings.warn( + "The method get_prefix_bias_name is deprecated. 
Please use `get_bias` instead.", + FutureWarning, + ) return None def get_bias(self) -> Union[None, Dict[str, tf.Variable]]: @@ -1225,15 +1268,25 @@ def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens): # initialize new bias if tf.math.greater(size_diff, 0): padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]] - current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1) + current_bias = tf.pad( + weight.value(), + tf.convert_to_tensor(padding_shape), + constant_values=-1, + ) num_tokens_to_copy = min(old_num_tokens, new_num_tokens) mask_shape = [num_tokens_to_copy] if first_dim is None else [1, num_tokens_to_copy] bias_mask = tf.fill(tf.convert_to_tensor(mask_shape), True) - bias_mask = tf.pad(bias_mask, tf.convert_to_tensor(padding_shape), constant_values=False) + bias_mask = tf.pad( + bias_mask, + tf.convert_to_tensor(padding_shape), + constant_values=False, + ) else: slice_from = [0] if first_dim is None else [0, 0] current_bias = tf.slice( - weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape) + weight.value(), + tf.convert_to_tensor(slice_from), + tf.convert_to_tensor(final_shape), ) bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True) @@ -1374,7 +1427,11 @@ def save_pretrained(self, save_directory, saved_model=False, version=1, push_to_ if saved_model: saved_model_dir = os.path.join(save_directory, "saved_model", str(version)) - self.save(saved_model_dir, include_optimizer=False, signatures=self.serving) + self.save( + saved_model_dir, + include_optimizer=False, + signatures=self.serving, + ) logger.info(f"Saved model created in {saved_model_dir}") # Save configuration file @@ -1526,7 +1583,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) - user_agent = {"file_type": "model", "framework": "tensorflow", 
"from_auto_class": from_auto_class} + user_agent = { + "file_type": "model", + "framework": "tensorflow", + "from_auto_class": from_auto_class, + } if from_pipeline is not None: user_agent["using_pipeline"] = from_pipeline @@ -1622,7 +1683,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "proxies": proxies, "use_auth_token": use_auth_token, } - if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs): + if has_file( + pretrained_model_name_or_path, + WEIGHTS_NAME, + **has_file_kwargs, + ): raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {TF2_WEIGHTS_NAME} " "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from " @@ -1772,7 +1837,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): # To update the docstring, we need to copy the method, otherwise we change the original docstring. TFPreTrainedModel.push_to_hub = copy_func(TFPreTrainedModel.push_to_hub) TFPreTrainedModel.push_to_hub.__doc__ = TFPreTrainedModel.push_to_hub.__doc__.format( - object="model", object_class="TFAutoModel", object_files="model checkpoint" + object="model", + object_class="TFAutoModel", + object_files="model checkpoint", ) @@ -1801,7 +1868,9 @@ def __init__(self, nf, nx, initializer_range=0.02, **kwargs): def build(self, input_shape): self.weight = self.add_weight( - "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) + "weight", + shape=[self.nx, self.nf], + initializer=get_initializer(self.initializer_range), ) self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) @@ -1839,7 +1908,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range + 
self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range def build(self, input_shape): """ @@ -1847,7 +1916,9 @@ def build(self, input_shape): https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ self.weight = self.add_weight( - "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) + "weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) @@ -1961,7 +2032,9 @@ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, ** else: num_classes = config.hidden_size self.summary = tf.keras.layers.Dense( - num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" + num_classes, + kernel_initializer=get_initializer(initializer_range), + name="summary", ) self.has_activation = False @@ -2056,7 +2129,9 @@ def register_for_auto_class(cls, auto_class="TFAutoModel"): cls._auto_class = auto_class -def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal: +def get_initializer( + initializer_range: float = 0.02, +) -> tf.initializers.TruncatedNormal: """ Creates a `tf.initializers.TruncatedNormal` with the given range. 
diff --git a/src/transformers/models/convnext/configuration_convnext.py b/src/transformers/models/convnext/configuration_convnext.py index c09a54e86a7e2..74067ad337bbf 100644 --- a/src/transformers/models/convnext/configuration_convnext.py +++ b/src/transformers/models/convnext/configuration_convnext.py @@ -101,4 +101,3 @@ def __init__( self.layer_scale_init_value = layer_scale_init_value self.drop_path_rate = drop_path_rate self.image_size = image_size - self.output_attentions = None From 78198505f19c1abce0361ca972af95cb72574271 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 25 Feb 2022 17:17:28 +0530 Subject: [PATCH 61/65] chore: revert to the earlier tf utils. --- src/transformers/modeling_tf_utils.py | 121 +++++--------------------- 1 file changed, 23 insertions(+), 98 deletions(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 9d392ec6e4ff0..2ab3e79381171 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -315,7 +315,6 @@ def booleans_processing(config, **kwargs): final_booleans["output_attentions"] = kwargs.get("output_attentions", None) if final_booleans["output_attentions"] is None: final_booleans["output_attentions"] = config.output_attentions - final_booleans["output_hidden_states"] = ( kwargs["output_hidden_states"] if kwargs["output_hidden_states"] is not None @@ -367,17 +366,7 @@ def input_processing(func, config, input_ids, **kwargs): signature.pop("self", None) parameter_names = list(signature.keys()) output = {} - allowed_types = ( - tf.Tensor, - bool, - int, - ModelOutput, - tuple, - list, - dict, - np.ndarray, - KerasTensor, - ) + allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray, KerasTensor) if "inputs" in kwargs["kwargs_call"]: warnings.warn( @@ -490,13 +479,7 @@ def input_processing(func, config, input_ids, **kwargs): boolean_dict = { k: v for k, v in output.items() - if k - in [ - "return_dict", - 
"output_attentions", - "output_hidden_states", - "use_cache", - ] + if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"] } output.update( @@ -595,18 +578,11 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False, # If yes we reshape the weight from the H5 file accordingly to the current weight # If the two shapes are not compatible we raise an issue try: - array = np.reshape( - saved_weight_value, - K.int_shape(symbolic_weight), - ) + array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight)) except ValueError as e: if ignore_mismatched_sizes: mismatched_layers.append( - ( - symbolic_weight_name, - saved_weight_value.shape, - K.int_shape(symbolic_weight), - ) + (symbolic_weight_name, saved_weight_value.shape, K.int_shape(symbolic_weight)) ) continue else: @@ -650,17 +626,11 @@ def init_copy_embeddings(old_embeddings, new_num_tokens): # and we create a mask to properly identify the padded values and be replaced by the values of the newly created # embeddings current_weights = tf.pad( - old_embeddings.value(), - tf.convert_to_tensor([[0, size_diff], [0, 0]]), - constant_values=-1, + old_embeddings.value(), tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=-1 ) num_tokens_to_copy = min(old_num_tokens, new_num_tokens) mask = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True) - mask = tf.pad( - mask, - tf.convert_to_tensor([[0, size_diff], [0, 0]]), - constant_values=False, - ) + mask = tf.pad(mask, tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=False) else: # if the new size if lower than the old one, we take the current embeddings until the new size current_weights = tf.slice( @@ -805,10 +775,7 @@ def _save_checkpoint(self, checkpoint_dir, epoch): # internally and which users are likely to use too weights_path = os.path.join(checkpoint_dir, "weights.h5") self.save_weights(weights_path) - extra_data = { - "epoch": epoch, - "optimizer_state": 
self.optimizer.get_weights(), - } + extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()} extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle") with open(extra_data_path, "wb") as f: pickle.dump(extra_data, f) @@ -834,10 +801,7 @@ def load_repo_checkpoint(self, repo_path_or_name): if not os.path.isdir(repo_path_or_name): # If this isn't a local path, check that the remote repo exists and has a checkpoint in it repo_files = list_repo_files(repo_path_or_name) - for file in ( - "checkpoint/weights.h5", - "checkpoint/extra_data.pickle", - ): + for file in ("checkpoint/weights.h5", "checkpoint/extra_data.pickle"): if file not in repo_files: raise FileNotFoundError(f"Repo {repo_path_or_name} does not contain checkpoint file {file}!") if "/" not in repo_path_or_name: @@ -845,10 +809,7 @@ def load_repo_checkpoint(self, repo_path_or_name): repo_path_or_name = self.get_full_repo_name(repo_path_or_name) else: model_id = repo_path_or_name.split("/")[-1] - repo = Repository( - model_id, - clone_from=f"https://huggingface.co/{repo_path_or_name}", - ) + repo = Repository(model_id, clone_from=f"https://huggingface.co/{repo_path_or_name}") local_dir = repo.local_dir else: local_dir = repo_path_or_name @@ -1105,8 +1066,7 @@ def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]: `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model. """ warnings.warn( - "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", - FutureWarning, + "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning ) return self.get_lm_head() @@ -1117,10 +1077,7 @@ def get_prefix_bias_name(self) -> Union[None, str]: Return: `str`: The _prefix name of the bias. """ - warnings.warn( - "The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", - FutureWarning, - ) + warnings.warn("The method get_prefix_bias_name is deprecated. 
Please use `get_bias` instead.", FutureWarning) return None def get_bias(self) -> Union[None, Dict[str, tf.Variable]]: @@ -1268,25 +1225,15 @@ def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens): # initialize new bias if tf.math.greater(size_diff, 0): padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]] - current_bias = tf.pad( - weight.value(), - tf.convert_to_tensor(padding_shape), - constant_values=-1, - ) + current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1) num_tokens_to_copy = min(old_num_tokens, new_num_tokens) mask_shape = [num_tokens_to_copy] if first_dim is None else [1, num_tokens_to_copy] bias_mask = tf.fill(tf.convert_to_tensor(mask_shape), True) - bias_mask = tf.pad( - bias_mask, - tf.convert_to_tensor(padding_shape), - constant_values=False, - ) + bias_mask = tf.pad(bias_mask, tf.convert_to_tensor(padding_shape), constant_values=False) else: slice_from = [0] if first_dim is None else [0, 0] current_bias = tf.slice( - weight.value(), - tf.convert_to_tensor(slice_from), - tf.convert_to_tensor(final_shape), + weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape) ) bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True) @@ -1427,11 +1374,7 @@ def save_pretrained(self, save_directory, saved_model=False, version=1, push_to_ if saved_model: saved_model_dir = os.path.join(save_directory, "saved_model", str(version)) - self.save( - saved_model_dir, - include_optimizer=False, - signatures=self.serving, - ) + self.save(saved_model_dir, include_optimizer=False, signatures=self.serving) logger.info(f"Saved model created in {saved_model_dir}") # Save configuration file @@ -1583,11 +1526,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) - user_agent = { - "file_type": "model", - "framework": "tensorflow", - 
"from_auto_class": from_auto_class, - } + user_agent = {"file_type": "model", "framework": "tensorflow", "from_auto_class": from_auto_class} if from_pipeline is not None: user_agent["using_pipeline"] = from_pipeline @@ -1683,11 +1622,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "proxies": proxies, "use_auth_token": use_auth_token, } - if has_file( - pretrained_model_name_or_path, - WEIGHTS_NAME, - **has_file_kwargs, - ): + if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs): raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {TF2_WEIGHTS_NAME} " "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from " @@ -1837,9 +1772,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): # To update the docstring, we need to copy the method, otherwise we change the original docstring. TFPreTrainedModel.push_to_hub = copy_func(TFPreTrainedModel.push_to_hub) TFPreTrainedModel.push_to_hub.__doc__ = TFPreTrainedModel.push_to_hub.__doc__.format( - object="model", - object_class="TFAutoModel", - object_files="model checkpoint", + object="model", object_class="TFAutoModel", object_files="model checkpoint" ) @@ -1868,9 +1801,7 @@ def __init__(self, nf, nx, initializer_range=0.02, **kwargs): def build(self, input_shape): self.weight = self.add_weight( - "weight", - shape=[self.nx, self.nf], - initializer=get_initializer(self.initializer_range), + "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) ) self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) @@ -1908,7 +1839,7 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optiona super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range + 
self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range def build(self, input_shape): """ @@ -1916,9 +1847,7 @@ def build(self, input_shape): https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ self.weight = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), + "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) ) super().build(input_shape) @@ -2032,9 +1961,7 @@ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, ** else: num_classes = config.hidden_size self.summary = tf.keras.layers.Dense( - num_classes, - kernel_initializer=get_initializer(initializer_range), - name="summary", + num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" ) self.has_activation = False @@ -2129,9 +2056,7 @@ def register_for_auto_class(cls, auto_class="TFAutoModel"): cls._auto_class = auto_class -def get_initializer( - initializer_range: float = 0.02, -) -> tf.initializers.TruncatedNormal: +def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal: """ Creates a `tf.initializers.TruncatedNormal` with the given range. 
From ba9484ff11c370859092d05f3013198691c03572 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 25 Feb 2022 17:59:37 +0530 Subject: [PATCH 62/65] fix: output shapes of the hidden states --- .../models/convnext/modeling_tf_convnext.py | 16 ++++++++++++---- tests/convnext/test_modeling_tf_convnext.py | 4 ++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 328194dddbc2c..f97b493bf30cd 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -292,7 +292,9 @@ def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwa self.embeddings = TFConvNextEmbeddings(config, name="embeddings") self.encoder = TFConvNextEncoder(config, name="encoder") self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") - self.pooler = tf.keras.layers.GlobalAvgPool2D() if add_pooling_layer else None + # We are setting the `data_format` like so because from here on we will revert to the + # NCHW output format + self.pooler = tf.keras.layers.GlobalAvgPool2D(data_format="channels_first") if add_pooling_layer else None def call( self, @@ -333,15 +335,21 @@ def call( ) last_hidden_state = encoder_outputs[0] + # Change to NCHW output format have uniformity in the modules + last_hidden_state = tf.transpose(last_hidden_state, perm=(0, 3, 1, 2)) pooled_output = self.layernorm(self.pooler(last_hidden_state)) + # Change the other hidden state outputs to NCHW as well + if output_hidden_states: + hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]]) + if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] return TFBaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, + 
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, ) @@ -504,10 +512,10 @@ def call( # converts back NHWC -> NCHW, to match PT's output if not return_dict: - return (tf.transpose(outputs[0], perm=(0, 3, 1, 2)),) + outputs[1:] + return (outputs[0],) + outputs[1:] return TFBaseModelOutputWithPooling( - last_hidden_state=tf.transpose(outputs.last_hidden_state, perm=(0, 3, 1, 2)), + last_hidden_state=outputs.last_hidden_state, pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, ) diff --git a/tests/convnext/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py index cfc2646176448..f5ccd1438af03 100644 --- a/tests/convnext/test_modeling_tf_convnext.py +++ b/tests/convnext/test_modeling_tf_convnext.py @@ -199,9 +199,9 @@ def check_hidden_states_output(inputs_dict, config, model_class): expected_num_stages = self.model_tester.num_stages self.assertEqual(len(hidden_states), expected_num_stages + 1) - # ConvNext's feature maps are of shape (batch_size, height, width, num_channels) in TF + # ConvNext's feature maps are of shape (batch_size, num_channels, height, width) self.assertListEqual( - list(hidden_states[0].shape[1:-1]), + list(hidden_states[0].shape[-2:]), [ self.model_tester.image_size // 4, self.model_tester.image_size // 4, From 553bac5342defc01ebc28f6947b9c85b8d0dcab5 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 25 Feb 2022 18:28:36 +0530 Subject: [PATCH 63/65] chore: removed unnecessary comment --- src/transformers/models/convnext/modeling_tf_convnext.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index f97b493bf30cd..c6e7f1311e532 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -510,7 +510,6 @@ def call( training=inputs["training"], ) - # converts back NHWC -> NCHW, to 
match PT's output if not return_dict: return (outputs[0],) + outputs[1:] From d22e0cbc8fbd07d8199f785c96f7598fd805f13f Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 25 Feb 2022 20:34:02 +0530 Subject: [PATCH 64/65] chore: reverting to the right test_modeling_tf_common.py. --- tests/test_modeling_tf_common.py | 395 ++++++------------------------- 1 file changed, 72 insertions(+), 323 deletions(-) diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 2038f29e56cf8..142bff7cae06e 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -83,8 +83,7 @@ # Restrict TensorFlow to only allocate x GB of memory on the GPUs try: tf.config.set_logical_device_configuration( - gpu, - [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)], + gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)] ) logical_gpus = tf.config.list_logical_devices("GPU") print("Logical GPUs", logical_gpus) @@ -117,10 +116,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict = { - k: tf.tile( - tf.expand_dims(v, 1), - (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1), - ) + k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1)) if isinstance(v, tf.Tensor) and v.ndim > 0 else v for k, v in inputs_dict.items() @@ -148,11 +144,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING), ]: inputs_dict["labels"] = tf.zeros( - ( - self.model_tester.batch_size, - self.model_tester.seq_length, - ), - dtype=tf.int32, + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32 ) return inputs_dict @@ -160,10 +152,7 @@ def test_initialization(self): pass def test_save_load(self): - ( - config, - inputs_dict, - ) = 
self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -177,10 +166,7 @@ def test_save_load(self): self.assert_outputs_same(after_outputs, outputs) def test_save_load_config(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -232,10 +218,7 @@ def test_onnx_compliancy(self): if not self.test_onnx: return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() INTERNAL_OPS = [ "Assert", "AssignVariableOp", @@ -282,10 +265,7 @@ def test_onnx_runtime_optimize(self): import onnxruntime import tf2onnx - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -296,10 +276,7 @@ def test_onnx_runtime_optimize(self): onnxruntime.InferenceSession(onnx_model_proto.SerializeToString()) def test_keras_save_load(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() tf_main_layer_classes = set( module_member @@ -344,8 +321,7 @@ def test_keras_save_load(self): ) else: model = tf.keras.models.load_model( - filepath, - custom_objects={main_layer_class.__name__: main_layer_class}, + filepath, custom_objects={main_layer_class.__name__: main_layer_class} ) assert isinstance(model, tf.keras.Model) after_outputs = model(inputs_dict) @@ -372,10 +348,7 @@ def test_pt_tf_model_equivalence(self): import transformers 
- ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning @@ -388,9 +361,7 @@ def test_pt_tf_model_equivalence(self): # Check we can load pt model in tf and vice-versa with model => model functions tf_model = transformers.load_pytorch_model_in_tf2_model( - tf_model, - pt_model, - tf_inputs=self._prepare_for_class(inputs_dict, model_class), + tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class) ) pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) @@ -411,10 +382,7 @@ def test_pt_tf_model_equivalence(self): with torch.no_grad(): pto = pt_model(**pt_inputs_dict) - tfo = tf_model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - ) + tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False) tf_hidden_states = tfo[0].numpy() pt_hidden_states = pto[0].numpy() @@ -473,20 +441,14 @@ def test_pt_tf_model_equivalence(self): self.assertLessEqual(max_diff, 4e-2) def test_compile_tf_model(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() max_input = getattr(self.model_tester, "max_position_embeddings", 512) optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") for model_class in self.all_model_classes: - if model_class.__name__ in [ - "TFSpeech2TextModel", - "TFSpeech2TextForConditionalGeneration", - ]: + if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]: inputs = { "decoder_input_ids": tf.keras.Input( 
batch_shape=(2, max_input), @@ -510,11 +472,7 @@ def test_compile_tf_model(self): name="decoder_input_ids", dtype="int32", ), - "input_ids": tf.keras.Input( - batch_shape=(2, max_input), - name="input_ids", - dtype="int32", - ), + "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"), } # `pixel_values` implies that the input is an image elif model_class.main_input_name == "pixel_values": @@ -530,11 +488,7 @@ def test_compile_tf_model(self): ) elif model_class.__name__ in ["TFCLIPModel"]: inputs = { - "input_ids": tf.keras.Input( - batch_shape=(3, max_input), - name="input_ids", - dtype="int32", - ), + "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"), "pixel_values": tf.keras.Input( batch_shape=( 3, @@ -547,11 +501,7 @@ def test_compile_tf_model(self): ), } elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs = tf.keras.Input( - batch_shape=(4, 2, max_input), - name="input_ids", - dtype="int32", - ) + inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32") else: inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") @@ -574,10 +524,7 @@ def test_compile_tf_model(self): extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) def test_keyword_and_dict_args(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) @@ -593,21 +540,10 @@ def test_keyword_and_dict_args(self): self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) def test_attention_outputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - decoder_seq_length = getattr( - 
self.model_tester, - "decoder_seq_length", - self.model_tester.seq_length, - ) - encoder_seq_length = getattr( - self.model_tester, - "encoder_seq_length", - self.model_tester.seq_length, - ) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) @@ -618,11 +554,7 @@ def check_decoder_attentions_output(outputs): self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - decoder_key_length, - ], + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], ) def check_encoder_attentions_output(outputs): @@ -632,11 +564,7 @@ def check_encoder_attentions_output(outputs): self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length, - ], + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], ) for model_class in self.all_model_classes: @@ -678,10 +606,7 @@ def test_headmasking(self): return random.Random().seed(42) - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() random.Random().seed() inputs_dict["output_attentions"] = True @@ -694,19 +619,11 @@ def test_headmasking(self): def prepare_layer_head_mask(i, attention_heads, num_hidden_layers): if i == 0: return tf.concat( - ( - tf.zeros(1, dtype=tf.float32), - tf.ones(attention_heads - 1, dtype=tf.float32), - ), - 0, + (tf.zeros(1, 
dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0 ) elif i == num_hidden_layers - 1: return tf.concat( - ( - tf.zeros(attention_heads - 1, dtype=tf.float32), - tf.ones(1, dtype=tf.float32), - ), - 0, + (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0 ) else: return tf.ones(attention_heads, dtype=tf.float32) @@ -735,8 +652,7 @@ def check_attentions_validity(attentions): # Remove Nan for t in attentions: self.assertLess( - (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), - (tf.size(t) / 4).numpy(), + (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy() ) # Check we don't have more than 25% nans (arbitrary) attentions = [ @@ -744,23 +660,11 @@ def check_attentions_validity(attentions): ] # remove them (the test is less complete) self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0) - self.assertNotEqual( - tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), - 0.0, - ) + self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0) if len(attentions) > 2: # encoder-decodere models have only 2 layers in each modules - self.assertNotEqual( - tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), - 0.0, - ) - self.assertAlmostEqual( - tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), - 0.0, - ) - self.assertNotEqual( - tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), - 0.0, - ) + self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0) + self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0) + self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0) if model.config.is_encoder_decoder: check_attentions_validity(outputs.encoder_attentions) @@ -771,18 +675,13 @@ def check_attentions_validity(attentions): check_attentions_validity(outputs.attentions) def test_hidden_states_output(self): - ( - config, 
- inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_hidden_states_output(config, inputs_dict, model_class): model = model_class(config) outputs = model(self._prepare_for_class(inputs_dict, model_class)) expected_num_layers = getattr( - self.model_tester, - "expected_num_hidden_layers", - self.model_tester.num_hidden_layers + 1, + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 ) if model.config.is_encoder_decoder: @@ -793,18 +692,12 @@ def check_hidden_states_output(config, inputs_dict, model_class): self.assertEqual(len(encoder_hidden_states), expected_num_layers) self.assertListEqual( list(encoder_hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], + [self.model_tester.seq_length, self.model_tester.hidden_size], ) self.assertEqual(len(decoder_hidden_states), expected_num_layers) self.assertListEqual( list(decoder_hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], + [self.model_tester.seq_length, self.model_tester.hidden_size], ) else: hidden_states = outputs.hidden_states @@ -812,10 +705,7 @@ def check_hidden_states_output(config, inputs_dict, model_class): self.assertEqual(len(hidden_states), expected_num_layers) self.assertListEqual( list(hidden_states[0].shape[-2:]), - [ - self.model_tester.seq_length, - self.model_tester.hidden_size, - ], + [self.model_tester.seq_length, self.model_tester.hidden_size], ) for model_class in self.all_model_classes: @@ -827,10 +717,7 @@ def check_hidden_states_output(config, inputs_dict, model_class): check_hidden_states_output(config, inputs_dict, model_class) def test_model_common_attributes(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() 
text_in_text_out_models = ( get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) @@ -860,22 +747,13 @@ def test_model_common_attributes(self): assert name is None def test_determinism(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) first, second = ( - model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - )[0], - model( - self._prepare_for_class(inputs_dict, model_class), - training=False, - )[0], + model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], + model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], ) out_1 = first.numpy() out_2 = second.numpy() @@ -886,10 +764,7 @@ def test_determinism(self): def test_model_outputs_equivalence(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) @@ -939,17 +814,11 @@ def recursive_check(tuple_object, dict_object): tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence( - model, - tuple_inputs, - dict_inputs, - {"output_hidden_states": True, "output_attentions": True}, + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} ) def test_inputs_embeds(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = 
model_class(config) @@ -976,10 +845,7 @@ def test_inputs_embeds(self): model(inputs) def test_numpy_arrays_inputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def prepare_numpy_arrays(inputs_dict): inputs_np_dict = {} @@ -1004,10 +870,7 @@ def prepare_numpy_arrays(inputs_dict): def test_resize_token_embeddings(self): if not self.test_resize_embeddings: return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def _get_word_embedding_weight(model, embedding_layer): embeds = getattr(embedding_layer, "weight", None) @@ -1066,25 +929,16 @@ def _get_word_embedding_weight(model, embedding_layer): if old_output_embeddings is not None and new_output_embeddings is not None: self.assertEqual(new_output_embeddings.shape[0], assert_size) - self.assertEqual( - new_output_embeddings.shape[1], - old_output_embeddings.shape[1], - ) + self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1]) models_equal = True - for p1, p2 in zip( - old_output_embeddings.value(), - new_output_embeddings.value(), - ): + for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: models_equal = False self.assertTrue(models_equal) def test_lm_head_model_random_no_beam_search_generate(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) # iterate over all generative models @@ -1111,25 +965,16 @@ def test_lm_head_model_random_no_beam_search_generate(self): # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [ - 
self._generate_random_bad_tokens(1, model), - self._generate_random_bad_tokens(2, model), - ] + bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] output_tokens = model.generate( - input_ids, - do_sample=True, - bad_words_ids=bad_words_ids, - num_return_sequences=2, + input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 ) # only count generated tokens generated_ids = output_tokens[:, input_ids.shape[-1] :] self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) def test_lm_head_model_no_beam_search_generate_dict_outputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) if input_ids is None: input_ids = inputs_dict.get("input_features", None) @@ -1162,10 +1007,7 @@ def test_lm_head_model_no_beam_search_generate_dict_outputs(self): self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput) def test_lm_head_model_random_beam_search_generate(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) for model_class in self.all_generative_model_classes: @@ -1180,12 +1022,7 @@ def test_lm_head_model_random_beam_search_generate(self): with self.assertRaises(AssertionError): # generating more sequences than having beams leads is not possible - model.generate( - input_ids, - do_sample=False, - num_return_sequences=3, - num_beams=2, - ) + model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2) # num_return_sequences > 1, sample self._check_generated_ids( @@ -1197,37 +1034,20 @@ def test_lm_head_model_random_beam_search_generate(self): ) ) # num_return_sequences > 1, greedy - 
self._check_generated_ids( - model.generate( - input_ids, - do_sample=False, - num_beams=2, - num_return_sequences=2, - ) - ) + self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2)) # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [ - self._generate_random_bad_tokens(1, model), - self._generate_random_bad_tokens(2, model), - ] + bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] output_tokens = model.generate( - input_ids, - do_sample=False, - bad_words_ids=bad_words_ids, - num_beams=2, - num_return_sequences=2, + input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 ) # only count generated tokens generated_ids = output_tokens[:, input_ids.shape[-1] :] self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) def test_lm_head_model_beam_search_generate_dict_outputs(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict.get("input_ids", None) if input_ids is None: input_ids = inputs_dict.get("input_features", None) @@ -1262,20 +1082,14 @@ def test_lm_head_model_beam_search_generate_dict_outputs(self): self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput) def test_loss_computation(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) if getattr(model, "hf_compute_loss", None): # The number of elements in the loss should be the same as the number of elements in the label prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, 
return_labels=True) added_label = prepared_for_class[ - sorted( - list(prepared_for_class.keys() - inputs_dict.keys()), - reverse=True, - )[0] + sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] ] loss_size = tf.size(added_label) @@ -1286,11 +1100,7 @@ def test_loss_computation(self): # Test that model correctly compute the loss with kwargs prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) - possible_input_names = { - "input_ids", - "pixel_values", - "input_features", - } + possible_input_names = {"input_ids", "pixel_values", "input_features"} input_name = possible_input_names.intersection(set(prepared_for_class)).pop() model_input = prepared_for_class.pop(input_name) @@ -1334,15 +1144,8 @@ def test_loss_computation(self): self.assertEqual(loss.shape, [loss_size]) def test_generate_with_headmasking(self): - attention_names = [ - "encoder_attentions", - "decoder_attentions", - "cross_attentions", - ] - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_generative_model_classes: model = model_class(config) @@ -1377,10 +1180,7 @@ def test_generate_with_headmasking(self): def test_load_with_mismatched_shapes(self): if not self.test_mismatched_shapes: return - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): @@ -1487,13 +1287,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): def random_attention_mask(shape, rng=None, name=None, dtype=None): attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, 
dtype=dtype) # make sure that at least one token is attended to for each batch - attn_mask = tf.concat( - [ - tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), - attn_mask[:, 1:], - ], - axis=1, - ) + attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1) return attn_mask @@ -1510,10 +1304,7 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): for _ in range(total_dims): values.append(rng.random() * scale) - return tf.reshape( - tf.constant(values, dtype=dtype if dtype is not None else tf.float32), - shape=shape, - ) + return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape) @require_tf @@ -1592,34 +1383,12 @@ def test_top_k_top_p_filtering(self): ) non_inf_expected_idx = tf.convert_to_tensor( - [ - [0, 0], - [0, 9], - [0, 10], - [0, 25], - [0, 26], - [1, 13], - [1, 17], - [1, 18], - [1, 20], - [1, 27], - ], + [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]], dtype=tf.int32, ) # expected non filtered idx as noted above non_inf_expected_output = tf.convert_to_tensor( - [ - 8.222099, - 7.3534126, - 8.432078, - 7.4402075, - 9.38451, - 6.271159, - 8.827531, - 5.4402995, - 7.3857956, - 9.677023, - ], + [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023], dtype=tf.float32, ) # expected non filtered values as noted above @@ -1650,31 +1419,19 @@ def tearDownClass(cls): pass try: - delete_repo( - token=cls._token, - name="test-model-tf-org", - organization="valid_org", - ) + delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org") except HTTPError: pass def test_push_to_hub(self): config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 ) model = TFBertModel(config) # Make 
sure model is properly initialized _ = model(model.dummy_inputs) with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained( - os.path.join(tmp_dir, "test-model-tf"), - push_to_hub=True, - use_auth_token=self._token, - ) + model.save_pretrained(os.path.join(tmp_dir, "test-model-tf"), push_to_hub=True, use_auth_token=self._token) new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf") models_equal = True @@ -1685,11 +1442,7 @@ def test_push_to_hub(self): def test_push_to_hub_with_model_card(self): config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 ) model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: @@ -1698,11 +1451,7 @@ def test_push_to_hub_with_model_card(self): def test_push_to_hub_in_organization(self): config = BertConfig( - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 ) model = TFBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir: From de00fb288a07d64a7e15b657a4d0a343a2d38602 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Fri, 25 Feb 2022 11:36:25 -0500 Subject: [PATCH 65/65] Styling nits --- .../models/convnext/modeling_tf_convnext.py | 15 ++----- tests/convnext/test_modeling_tf_convnext.py | 45 +++---------------- 2 files changed, 10 insertions(+), 50 deletions(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index c6e7f1311e532..fbb436059340f 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -141,15 +141,9 @@ def __init__(self, config, dim, drop_path=0.0, **kwargs): # Using `layers.Activation` instead 
of `tf.identity` to better control `training` # behaviour. self.drop_path = ( - TFConvNextDropPath( - drop_path, - name="drop_path", - ) + TFConvNextDropPath(drop_path, name="drop_path") if drop_path > 0.0 - else tf.keras.layers.Activation( - "linear", - name="drop_path", - ) + else tf.keras.layers.Activation("linear", name="drop_path") ) def build(self, input_shape: tf.TensorShape): @@ -275,10 +269,7 @@ def call(self, hidden_states, output_hidden_states=False, return_dict=True): if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - return TFBaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - ) + return TFBaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states) @keras_serializable diff --git a/tests/convnext/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py index f5ccd1438af03..880e006f1abf2 100644 --- a/tests/convnext/test_modeling_tf_convnext.py +++ b/tests/convnext/test_modeling_tf_convnext.py @@ -73,14 +73,7 @@ def __init__( self.scope = scope def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [ - self.batch_size, - self.num_channels, - self.image_size, - self.image_size, - ] - ) + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) labels = None if self.use_labels: @@ -107,22 +100,14 @@ def create_and_check_model(self, config, pixel_values, labels): # expected last hidden states: B, C, H // 32, W // 32 self.parent.assertEqual( result.last_hidden_state.shape, - ( - self.batch_size, - self.hidden_sizes[-1], - self.image_size // 32, - self.image_size // 32, - ), + (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), ) def create_and_check_for_image_classification(self, config, pixel_values, labels): config.num_labels = self.type_sequence_label_size model = TFConvNextForImageClassification(config) result = model(pixel_values, 
labels=labels, training=False) - self.parent.assertEqual( - result.logits.shape, - (self.batch_size, self.type_sequence_label_size), - ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -138,14 +123,7 @@ class TFConvNextModelTest(TFModelTesterMixin, unittest.TestCase): attention_mask and seq_length. """ - all_model_classes = ( - ( - TFConvNextModel, - TFConvNextForImageClassification, - ) - if is_tf_available() - else () - ) + all_model_classes = (TFConvNextModel, TFConvNextForImageClassification) if is_tf_available() else () test_pruning = False test_onnx = False @@ -202,16 +180,10 @@ def check_hidden_states_output(inputs_dict, config, model_class): # ConvNext's feature maps are of shape (batch_size, num_channels, height, width) self.assertListEqual( list(hidden_states[0].shape[-2:]), - [ - self.model_tester.image_size // 4, - self.model_tester.image_size // 4, - ], + [self.model_tester.image_size // 4, self.model_tester.image_size // 4], ) - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True @@ -225,10 +197,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): # Since ConvNext does not have any attention we need to rewrite this test. def test_model_outputs_equivalence(self): - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)