diff --git a/docs/source/en/model_doc/deformable_detr.mdx b/docs/source/en/model_doc/deformable_detr.mdx index 7997b2f19d2e82..08cbbb0aa02189 100644 --- a/docs/source/en/model_doc/deformable_detr.mdx +++ b/docs/source/en/model_doc/deformable_detr.mdx @@ -33,6 +33,16 @@ alt="drawing" width="600"/> This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/fundamentalvision/Deformable-DETR). +## DeformableDetrFeatureExtractor + +[[autodoc]] DeformableDetrFeatureExtractor + - __call__ + - pad_and_create_pixel_mask + - post_process + - post_process_segmentation + - post_process_panoptic + + ## DeformableDetrConfig [[autodoc]] DeformableDetrConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6abc53c85008e3..123c56bdd53267 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -659,6 +659,7 @@ _import_structure["models.beit"].append("BeitFeatureExtractor") _import_structure["models.clip"].append("CLIPFeatureExtractor") _import_structure["models.convnext"].append("ConvNextFeatureExtractor") + _import_structure["models.deformable_detr"].append("DeformableDetrFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") _import_structure["models.conditional_detr"].append("ConditionalDetrFeatureExtractor") @@ -3512,6 +3513,7 @@ from .models.clip import CLIPFeatureExtractor from .models.conditional_detr import ConditionalDetrFeatureExtractor from .models.convnext import ConvNextFeatureExtractor + from .models.deformable_detr import DeformableDetrFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor from .models.donut import DonutFeatureExtractor diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index cb75f439c233d3..3f2265875f2a69 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -44,7 +44,7 @@ ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("data2vec-vision", "BeitFeatureExtractor"), - ("deformable_detr", "DetrFeatureExtractor"), + ("deformable_detr", "DeformableDetrFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), ("donut", "DonutFeatureExtractor"), diff --git a/src/transformers/models/deformable_detr/__init__.py b/src/transformers/models/deformable_detr/__init__.py index f70d937c7ff468..36c1a83b0eee4e 100644 --- a/src/transformers/models/deformable_detr/__init__.py +++ b/src/transformers/models/deformable_detr/__init__.py @@ -18,13 +18,21 @@ from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_timm_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_timm_available, is_vision_available _import_structure = { "configuration_deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"], } +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_deformable_detr"] = ["DeformableDetrFeatureExtractor"] + try: if not is_timm_available(): raise OptionalDependencyNotAvailable() @@ -42,6 +50,14 @@ if TYPE_CHECKING: from .configuration_deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_deformable_detr import DeformableDetrFeatureExtractor + try: if not is_timm_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py index 85ee723fb33968..30726c5e9744c6 100644 --- a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py +++ b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py @@ -24,7 +24,7 @@ import requests from huggingface_hub import cached_download, hf_hub_url -from transformers import DeformableDetrConfig, DeformableDetrForObjectDetection, DetrFeatureExtractor +from transformers import DeformableDetrConfig, DeformableDetrFeatureExtractor, DeformableDetrForObjectDetection from transformers.utils import logging @@ -116,7 +116,7 @@ def convert_deformable_detr_checkpoint( config.label2id = {v: k for k, v in id2label.items()} # load feature extractor - feature_extractor = DetrFeatureExtractor(format="coco_detection") + feature_extractor = DeformableDetrFeatureExtractor(format="coco_detection") # prepare image img = prepare_img() diff --git a/src/transformers/models/deformable_detr/feature_extraction_deformable_detr.py b/src/transformers/models/deformable_detr/feature_extraction_deformable_detr.py new file mode 100644 index 00000000000000..415df84fd196b8 --- /dev/null +++ b/src/transformers/models/deformable_detr/feature_extraction_deformable_detr.py @@ -0,0 +1,946 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for Deformable DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Dict, List, Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor +from ...utils import TensorType, is_torch_available, logging + + +if is_torch_available(): + import torch + from torch import nn + +logger = logging.get_logger(__name__) + + +ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] + + +# Copied from transformers.models.detr.feature_extraction_detr.center_to_corners_format +def center_to_corners_format(x): + """ + Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format + (x_0, y_0, x_1, y_1). + """ + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +# Copied from transformers.models.detr.feature_extraction_detr.corners_to_center_format +def corners_to_center_format(x): + """ + Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1, + y_1) to center format (center_x, center_y, width, height). + """ + x_transposed = x.T + x0, y0, x1, y1 = x_transposed[0], x_transposed[1], x_transposed[2], x_transposed[3] + b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] + return np.stack(b, axis=-1) + + +# Copied from transformers.models.detr.feature_extraction_detr.masks_to_boxes +def masks_to_boxes(masks): + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + + Returns a [N, 4] tensor, with the boxes in corner (xyxy) format. + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# Copied from transformers.models.detr.feature_extraction_detr.rgb_to_id +def rgb_to_id(color): + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +# Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb +def id_to_rgb(id_map): + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class DeformableDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a Deformable DETR feature extractor. Differs only in the postprocessing of object detection compared to + DETR. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + Args: + format (`str`, *optional*, defaults to `"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input to a certain `size`. + size (`int`, *optional*, defaults to 800): + Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a + sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of + the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * + height / width, size)`. + max_size (`int`, *optional*, defaults to 1333): + The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is + set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`): + The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean. + image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the + ImageNet std. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__init__ + def __init__( + self, + format="coco_detection", + do_resize=True, + size=800, + max_size=1333, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.format = self._is_valid_format(format) + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean + self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._is_valid_format + def _is_valid_format(self, format): + if format not in ["coco_detection", "coco_panoptic"]: + raise ValueError(f"Format {format} not supported") + return format + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare + def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): + if self.format == "coco_detection": + image, target = self.prepare_coco_detection(image, target, return_segmentation_masks) + return image, target + elif self.format == "coco_panoptic": + image, target = self.prepare_coco_panoptic(image, target, masks_path) + return image, target + else: + raise ValueError(f"Format {self.format} not supported") + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.convert_coco_poly_to_mask + def convert_coco_poly_to_mask(self, segmentations, height, width): + + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection + def prepare_coco_detection(self, image, target, return_segmentation_masks=False): + """ + Convert the target in COCO format into the format expected by DETR. + """ + w, h = image.size + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # get all COCO annotations for the given image + anno = target["annotations"] + + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=w) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = np.asarray(classes, dtype=np.int64) + + if return_segmentation_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = self.convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.reshape((-1, 3)) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if return_segmentation_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["class_labels"] = classes + if return_segmentation_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in anno], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno], dtype=np.int64) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + + return image, target + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_panoptic + def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): + w, h = image.size + ann_info = target.copy() + ann_path = pathlib.Path(masks_path) / ann_info["file_name"] + + if "segments_info" in ann_info: + masks = np.asarray(Image.open(ann_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([ann["id"] for ann in ann_info["segments_info"]]) + masks = masks == ids[:, None, None] + masks = np.asarray(masks, dtype=np.uint8) + + labels = np.asarray([ann["category_id"] for ann in ann_info["segments_info"]], dtype=np.int64) + + target = {} + target["image_id"] = np.asarray( + [ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]], dtype=np.int64 + ) + if return_masks: + target["masks"] = masks + target["class_labels"] = labels + + target["boxes"] = masks_to_boxes(masks) + + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + if "segments_info" in ann_info: + target["iscrowd"] = np.asarray([ann["iscrowd"] for ann in ann_info["segments_info"]], dtype=np.int64) + target["area"] = np.asarray([ann["area"] for ann in ann_info["segments_info"]], dtype=np.float32) + + return image, target + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize + def _resize(self, image, size, target=None, max_size=None): + """ + Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller + edge of the image will be matched to this number. + + If given, also resize the target accordingly. + """ + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size + else: + # size returned must be (w, h) since we use PIL to resize images + # so we revert the tuple + return get_size_with_aspect_ratio(image_size, size, max_size)[::-1] + + size = get_size(image.size, size, max_size) + rescaled_image = self.resize(image, size=size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + w, h = size + target["size"] = np.asarray([h, w], dtype=np.int64) + + if "masks" in target: + # use PyTorch as current workaround + # TODO replace by self.resize + masks = torch.from_numpy(target["masks"][:, None]).float() + interpolated_masks = nn.functional.interpolate(masks, size=(h, w), mode="nearest")[:, 0] > 0.5 + target["masks"] = interpolated_masks.numpy() + + return rescaled_image, target + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize + def _normalize(self, image, mean, std, target=None): + """ + Normalize the image with a certain mean and std. + + If given, also normalize the target bounding boxes based on the size of the image. + """ + + image = self.normalize(image, mean=mean, std=std) + if target is None: + return image, None + + target = target.copy() + h, w = image.shape[-2:] + + if "boxes" in target: + boxes = target["boxes"] + boxes = corners_to_center_format(boxes) + boxes = boxes / np.asarray([w, h, w, h], dtype=np.float32) + target["boxes"] = boxes + + return image, target + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__call__ + def __call__( + self, + images: ImageInput, + annotations: Union[List[Dict], List[List[Dict]]] = None, + return_segmentation_masks: Optional[bool] = False, + masks_path: Optional[pathlib.Path] = None, + pad_and_return_pixel_mask: Optional[bool] = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s) and optional annotations. Images are by default + padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are + real/which are padding. + + + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + annotations (`Dict`, `List[Dict]`, *optional*): + The corresponding annotations in COCO format. + + In case [`DetrFeatureExtractor`] was initialized with `format = "coco_detection"`, the annotations for + each image should have the following format: {'image_id': int, 'annotations': [annotation]}, with the + annotations being a list of COCO object annotations. + + In case [`DetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the annotations for + each image should have the following format: {'image_id': int, 'file_name': str, 'segments_info': + [segment_info]} with segments_info being a list of COCO panoptic annotations. + + return_segmentation_masks (`Dict`, `List[Dict]`, *optional*, defaults to `False`): + Whether to also include instance segmentation masks as part of the labels in case `format = + "coco_detection"`. + + masks_path (`pathlib.Path`, *optional*): + Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only + relevant in case [`DetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`. + + pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether or not to pad images up to the largest image in a batch and create a pixel mask. + + If left to the default, will return a pixel mask that is: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor` + objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if + *"pixel_mask"* is in `self.model_input_names`). + - **labels** -- Optional labels to be fed to a model (when `annotations` are provided) + """ + # Input type checking for clearer error + + valid_images = False + valid_annotations = False + valid_masks_path = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." + ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + # Check that annotations has a valid type + if annotations is not None: + if not is_batched: + if self.format == "coco_detection": + if isinstance(annotations, dict) and "image_id" in annotations and "annotations" in annotations: + if isinstance(annotations["annotations"], (list, tuple)): + # an image can have no annotations + if len(annotations["annotations"]) == 0 or isinstance(annotations["annotations"][0], dict): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations, dict) and "image_id" in annotations and "segments_info" in annotations: + if isinstance(annotations["segments_info"], (list, tuple)): + # an image can have no segments (?) + if len(annotations["segments_info"]) == 0 or isinstance( + annotations["segments_info"][0], dict + ): + valid_annotations = True + else: + if isinstance(annotations, (list, tuple)): + if len(images) != len(annotations): + raise ValueError("There must be as many annotations as there are images") + if isinstance(annotations[0], Dict): + if self.format == "coco_detection": + if isinstance(annotations[0]["annotations"], (list, tuple)): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations[0]["segments_info"], (list, tuple)): + valid_annotations = True + + if not valid_annotations: + raise ValueError( + """ + Annotations must of type `Dict` (single image) or `List[Dict]` (batch of images). In case of object + detection, each dictionary should contain the keys 'image_id' and 'annotations', with the latter + being a list of annotations in COCO format. In case of panoptic segmentation, each dictionary + should contain the keys 'file_name', 'image_id' and 'segments_info', with the latter being a list + of annotations in COCO format. + """ + ) + + # Check that masks_path has a valid type + if masks_path is not None: + if self.format == "coco_panoptic": + if isinstance(masks_path, pathlib.Path): + valid_masks_path = True + if not valid_masks_path: + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + " `pathlib.Path` object." + ) + + if not is_batched: + images = [images] + if annotations is not None: + annotations = [annotations] + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + image, target = self.prepare(image, target, return_segmentation_masks, masks_path) + images[idx] = image + annotations[idx] = target + + # transformations (resizing + normalization) + if self.do_resize and self.size is not None: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._resize(image=image, target=target, size=self.size, max_size=self.max_size) + images[idx] = image + annotations[idx] = target + else: + for idx, image in enumerate(images): + images[idx] = self._resize(image=image, target=None, size=self.size, max_size=self.max_size)[0] + + if self.do_normalize: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._normalize( + image=image, mean=self.image_mean, std=self.image_std, target=target + ) + images[idx] = image + annotations[idx] = target + else: + images = [ + self._normalize(image=image, mean=self.image_mean, std=self.image_std)[0] for image in images + ] + + if pad_and_return_pixel_mask: + # pad images up to largest image in batch and create pixel_mask + max_size = self._max_by_axis([list(image.shape) for image in images]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in images: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + images = padded_images + + # return as BatchFeature + data = {} + data["pixel_values"] = images + if pad_and_return_pixel_mask: + data["pixel_mask"] = pixel_mask + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + # Convert to TensorType + tensor_type = return_tensors + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + if not tensor_type == TensorType.PYTORCH: + raise ValueError("Only PyTorch is supported for the moment.") + else: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + + encoded_inputs["labels"] = [ + {k: torch.from_numpy(v) for k, v in target.items()} for target in annotations + ] + + return encoded_inputs + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis + def _max_by_axis(self, the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.pad_and_create_pixel_mask + def pad_and_create_pixel_mask( + self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None + ): + """ + Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. + + Args: + pixel_values_list (`List[torch.Tensor]`): + List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor` + objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if + *"pixel_mask"* is in `self.model_input_names`). + + """ + + max_size = self._max_by_axis([list(image.shape) for image in pixel_values_list]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in pixel_values_list: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + + # return as BatchFeature + data = {"pixel_values": padded_images, "pixel_mask": pixel_mask} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs + + def post_process(self, outputs, target_sizes): + """ + Converts the output of [`DeformableDetrForObjectDetection`] into the format expected by the COCO api. Only + supports PyTorch. + + Args: + outputs ([`DeformableDetrObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = topk_indexes // out_logits.shape[2] + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_segmentation with Detr->DeformableDetr + def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): + """ + Converts the output of [`DeformableDetrForSegmentation`] into image segmentation predictions. Only supports + PyTorch. + + Parameters: + outputs ([`DeformableDetrSegmentationOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. + threshold (`float`, *optional*, defaults to 0.9): + Threshold to use to filter out queries. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image + in the batch as predicted by the model. + """ + out_logits, raw_masks = outputs.logits, outputs.pred_masks + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1 + + predictions = {"scores": cur_scores, "labels": cur_classes, "masks": cur_masks} + preds.append(predictions) + return preds + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_instance with Detr->DeformableDetr + def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): + """ + Converts the output of [`DeformableDetrForSegmentation`] into actual instance segmentation predictions. Only + supports PyTorch. + + Args: + results (`List[Dict]`): + Results list obtained by [`~DeformableDetrFeatureExtractor.post_process`], to which "masks" results + will be added. + outputs ([`DeformableDetrSegmentationOutput`]): + Raw outputs of the model. + orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). + max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). + threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an + image in the batch as predicted by the model. + """ + + if len(orig_target_sizes) != len(max_target_sizes): + raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes") + max_h, max_w = max_target_sizes.max(0)[0].tolist() + outputs_masks = outputs.pred_masks.squeeze(2) + outputs_masks = nn.functional.interpolate( + outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False + ) + outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() + + for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + img_h, img_w = t[0], t[1] + results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) + results[i]["masks"] = nn.functional.interpolate( + results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" + ).byte() + + return results + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_panoptic with Detr->DeformableDetr + def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): + """ + Converts the output of [`DeformableDetrForSegmentation`] into actual panoptic predictions. Only supports + PyTorch. + + Parameters: + outputs ([`DeformableDetrSegmentationOutput`]): + Raw outputs of the model. + processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data + augmentation but before batching. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to + None, it will default to the `processed_sizes`. + is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + Dictionary mapping class indices to either True or False, depending on whether or not they are a thing. + If not set, defaults to the `is_thing_map` of COCO panoptic. + threshold (`float`, *optional*, defaults to 0.85): + Threshold to use to filter out queries. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for + an image in the batch as predicted by the model. + """ + if target_sizes is None: + target_sizes = processed_sizes + if len(processed_sizes) != len(target_sizes): + raise ValueError("Make sure to pass in as many processed_sizes as target_sizes") + + if is_thing_map is None: + # default to is_thing_map of COCO panoptic + is_thing_map = {i: i <= 90 for i in range(201)} + + out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes + if not len(out_logits) == len(raw_masks) == len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" + ) + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = center_to_corners_format(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + if len(cur_boxes) != len(cur_classes): + raise ValueError("Not as many boxes as there are classes") + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_classes): + if not is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) + + np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) + np_seg_img = np_seg_img.view(final_h, final_w, 3) + np_seg_img = np_seg_img.numpy() + + m_id = torch.from_numpy(rgb_to_id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_classes.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_classes[i].item() + segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) + del cur_classes + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 5038cd391c2b3c..0b176a318c4c03 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1884,15 +1884,15 @@ def forward( >>> results = feature_extractor.post_process(outputs, target_sizes=target_sizes)[0] >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): ... box = [round(i, 2) for i in box.tolist()] - ... # let's only keep detections with score > 0.7 - ... if score > 0.7: + ... # let's only keep detections with score > 0.5 + ... if score > 0.5: ... print( ... f"Detected {model.config.id2label[label.item()]} with confidence " ... f"{round(score.item(), 3)} at location {box}" ... ) - Detected cat with confidence 0.856 at location [342.19, 24.3, 640.02, 372.25] - Detected remote with confidence 0.739 at location [40.79, 72.78, 176.76, 117.25] - Detected cat with confidence 0.859 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] + Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py index 91e406c71fc944..4377bd6f8d60e9 100644 --- a/src/transformers/models/detr/feature_extraction_detr.py +++ b/src/transformers/models/detr/feature_extraction_detr.py @@ -137,7 +137,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * height / width, size)`. - max_size (`int`, *optional*, defaults to `1333`): + max_size (`int`, *optional*, defaults to 1333): The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is set to `True`. do_normalize (`bool`, *optional*, defaults to `True`): @@ -683,9 +683,9 @@ def post_process(self, outputs, target_sizes): outputs ([`DetrObjectDetectionOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original - image size (before any data augmentation). For visualization, this should be the image size after data - augment, but before padding. + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. Returns: `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image diff --git a/src/transformers/models/yolos/feature_extraction_yolos.py b/src/transformers/models/yolos/feature_extraction_yolos.py index e199d1ae7bf463..616d8e4849dc4d 100644 --- a/src/transformers/models/yolos/feature_extraction_yolos.py +++ b/src/transformers/models/yolos/feature_extraction_yolos.py @@ -666,9 +666,9 @@ def post_process(self, outputs, target_sizes): outputs ([`DetrObjectDetectionOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original - image size (before any data augmentation). For visualization, this should be the image size after data - augment, but before padding. + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. Returns: `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index d2ec5be33ceb8c..2d1f0a88cd0ebf 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -38,6 +38,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class DeformableDetrFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class DeiTFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/deformable_detr/test_feature_extraction_deformable_detr.py b/tests/models/deformable_detr/test_feature_extraction_deformable_detr.py new file mode 100644 index 00000000000000..6a7cfefee52c0d --- /dev/null +++ b/tests/models/deformable_detr/test_feature_extraction_deformable_detr.py @@ -0,0 +1,341 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_vision_available + +from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DeformableDetrFeatureExtractor + + +class DeformableDetrFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "max_size": self.max_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to DeformableDetrFeatureExtractor, + assuming do_resize is set to True with a scalar size. + """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size * h / w) + expected_width = self.size + elif w > h: + expected_height = self.size + expected_width = int(self.size * w / h) + else: + expected_height = self.size + expected_width = self.size + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + +@require_torch +@require_vision +class DeformableDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = DeformableDetrFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = DeformableDetrFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "max_size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_equivalence_pad_and_create_pixel_mask(self): + # Initialize feature_extractors + feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict) + feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test whether the method "pad_and_return_pixel_mask" and calling the feature extractor return the same tensors + encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") + encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") + + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + ) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + ) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # encode them + feature_extractor = DeformableDetrFeatureExtractor() + encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([800, 1066]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # encode them + feature_extractor = DeformableDetrFeatureExtractor(format="coco_panoptic") + encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify masks + expected_masks_sum = 822338 + self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([800, 1066]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) diff --git a/tests/models/detr/test_feature_extraction_detr.py b/tests/models/detr/test_feature_extraction_detr.py index 58bde80fbbb11e..30fae36ed83214 100644 --- a/tests/models/detr/test_feature_extraction_detr.py +++ b/tests/models/detr/test_feature_extraction_detr.py @@ -240,8 +240,12 @@ def test_equivalence_pad_and_create_pixel_mask(self): encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") - assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + ) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + ) @slow def test_call_pytorch_with_coco_detection_annotations(self): @@ -261,31 +265,31 @@ def test_call_pytorch_with_coco_detection_annotations(self): self.assertEqual(encoding["pixel_values"].shape, expected_shape) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) # verify area expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) - assert torch.allclose(encoding["labels"][0]["area"], expected_area) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) # verify boxes expected_boxes_shape = torch.Size([6, 4]) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) # verify image_id expected_image_id = torch.tensor([39769]) - assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) # verify is_crowd expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) # verify class_labels expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) - assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) # verify orig_size expected_orig_size = torch.tensor([480, 640]) - assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) # verify size expected_size = torch.tensor([800, 1066]) - assert torch.allclose(encoding["labels"][0]["size"], expected_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) @slow def test_call_pytorch_with_coco_panoptic_annotations(self): @@ -299,8 +303,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") # encode them - # TODO replace by .from_pretrained facebook/detr-resnet-50-panoptic - feature_extractor = DetrFeatureExtractor(format="coco_panoptic") + feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50-panoptic") encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") # verify pixel values @@ -308,31 +311,31 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): self.assertEqual(encoding["pixel_values"].shape, expected_shape) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) # verify area expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) - assert torch.allclose(encoding["labels"][0]["area"], expected_area) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) # verify boxes expected_boxes_shape = torch.Size([6, 4]) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) # verify image_id expected_image_id = torch.tensor([39769]) - assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) # verify is_crowd expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) # verify class_labels expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) - assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) # verify masks expected_masks_sum = 822338 self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) # verify orig_size expected_orig_size = torch.tensor([480, 640]) - assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) # verify size expected_size = torch.tensor([800, 1066]) - assert torch.allclose(encoding["labels"][0]["size"], expected_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) diff --git a/tests/models/yolos/test_feature_extraction_yolos.py b/tests/models/yolos/test_feature_extraction_yolos.py index 8a576a583a9af0..1c3805e8cdaccd 100644 --- a/tests/models/yolos/test_feature_extraction_yolos.py +++ b/tests/models/yolos/test_feature_extraction_yolos.py @@ -240,7 +240,9 @@ def test_equivalence_padding(self): encoded_images_with_method = feature_extractor_1.pad(image_inputs, return_tensors="pt") encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") - assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + ) @slow def test_call_pytorch_with_coco_detection_annotations(self): @@ -260,31 +262,31 @@ def test_call_pytorch_with_coco_detection_annotations(self): self.assertEqual(encoding["pixel_values"].shape, expected_shape) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) # verify area expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) - assert torch.allclose(encoding["labels"][0]["area"], expected_area) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) # verify boxes expected_boxes_shape = torch.Size([6, 4]) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) # verify image_id expected_image_id = torch.tensor([39769]) - assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) # verify is_crowd expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) # verify class_labels expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) - assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) # verify orig_size expected_orig_size = torch.tensor([480, 640]) - assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) # verify size expected_size = torch.tensor([800, 1066]) - assert torch.allclose(encoding["labels"][0]["size"], expected_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) @slow def test_call_pytorch_with_coco_panoptic_annotations(self): @@ -306,31 +308,31 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): self.assertEqual(encoding["pixel_values"].shape, expected_shape) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) # verify area expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) - assert torch.allclose(encoding["labels"][0]["area"], expected_area) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) # verify boxes expected_boxes_shape = torch.Size([6, 4]) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) # verify image_id expected_image_id = torch.tensor([39769]) - assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) # verify is_crowd expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) # verify class_labels expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) - assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) # verify masks expected_masks_sum = 822338 self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) # verify orig_size expected_orig_size = torch.tensor([480, 640]) - assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) # verify size expected_size = torch.tensor([800, 1066]) - assert torch.allclose(encoding["labels"][0]["size"], expected_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))