From 69df33f18076f4e509e09f8697c2f4bb0c063a85 Mon Sep 17 00:00:00 2001
From: Alara Dirik <8944735+alaradirik@users.noreply.github.com>
Date: Tue, 13 Sep 2022 09:36:03 +0300
Subject: [PATCH] Fix MaskFormerFeatureExtractor instance segmentation
 preprocessing bug (#18997)

* fix preprocessing for instance segmentation maps

* add support for per-image instance2class_id mapping

* edit docstrings for clarity
---
 .../feature_extraction_maskformer.py          | 103 ++++++++++--------
 1 file changed, 60 insertions(+), 43 deletions(-)

diff --git a/src/transformers/models/maskformer/feature_extraction_maskformer.py b/src/transformers/models/maskformer/feature_extraction_maskformer.py
index 3a5fd49d80fa7..8514bb26da2ac 100644
--- a/src/transformers/models/maskformer/feature_extraction_maskformer.py
+++ b/src/transformers/models/maskformer/feature_extraction_maskformer.py
@@ -69,11 +69,12 @@ class MaskFormerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
             The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
             ImageNet std.
         ignore_index (`int`, *optional*):
-            Value of the index (label) to be removed from the segmentation maps.
+            Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
+            denoted with 0 (background) will be replaced with `ignore_index`.
         reduce_labels (`bool`, *optional*, defaults to `False`):
-            Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
-            used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
-            background label will be replaced by `ignore_index`.
+            Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0
+            is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k).
+            The background label will be replaced by `ignore_index`.
     """
 
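For context, a minimal construction sketch showing how these two options are typically combined for an ADE20k-style dataset (the checkpoint name is illustrative, not part of this patch):

    from transformers import MaskFormerFeatureExtractor

    # reduce_labels=True drops the background label (0) from the unique ids and
    # decrements the remaining labels by 1; ignore_index is the value assigned
    # to background pixels, with 255 being a common choice for ADE20k-style data.
    feature_extractor = MaskFormerFeatureExtractor.from_pretrained(
        "facebook/maskformer-swin-base-ade", ignore_index=255, reduce_labels=True
    )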
@@ -162,7 +163,7 @@ def __call__(
         images: ImageInput,
         segmentation_maps: ImageInput = None,
         pad_and_return_pixel_mask: Optional[bool] = True,
-        instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
+        instance_id_to_semantic_id: Optional[Union[List[Dict[int, int]], Dict[int, int]]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         **kwargs,
     ) -> BatchFeature:
@@ -191,7 +192,9 @@ def __call__(
                 number of channels, H and W are image height and width.
             segmentation_maps (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`,
             `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
-                Optionally, the corresponding semantic segmentation maps with the pixel-wise annotations.
+                The corresponding semantic segmentation maps with the pixel-wise class id annotations or instance
+                segmentation maps with pixel-wise instance id annotations. Assumed to be semantic segmentation maps if
+                no `instance_id_to_semantic_id` map is provided.
             pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`):
                 Whether or not to pad images up to the largest image in a batch and create a pixel mask.
 
@@ -201,10 +204,11 @@ def __call__(
                 If left to the default, will return a pixel mask that is:
 
                 - 1 for pixels that are real (i.e. **not masked**),
                 - 0 for pixels that are padding (i.e. **masked**).
 
-            instance_id_to_semantic_id (`Dict[int, int]`, *optional*):
-                If passed, we treat `segmentation_maps` as an instance segmentation map where each pixel represents an
-                instance id. To convert it to a binary mask of shape (`batch, num_labels, height, width`) we need a
-                dictionary mapping instance ids to label ids to create a semantic segmentation map.
+            instance_id_to_semantic_id (`List[Dict[int, int]]` or `Dict[int, int]`, *optional*):
+                A mapping between object instance ids and class ids. If passed, `segmentation_maps` is treated as an
+                instance segmentation map where each pixel represents an instance id. Can be provided as a single
+                dictionary with a global/dataset-level mapping or as a list of dictionaries (one per image), to map
+                instance ids in each image separately.
 
             return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor`
@@ -215,11 +219,11 @@ def __call__(
 
             - **pixel_values** -- Pixel values to be fed to a model.
             - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if
-              *"pixel_mask"* is in `self.model_input_names`).
-            - **mask_labels** -- Optional list of mask labels of shape `(labels, height, width)` to be fed to a model
-              (when `annotations` are provided).
-            - **class_labels** -- Optional list of class labels of shape `(labels)` to be fed to a model (when
-              `annotations` are provided). They identify the labels of `mask_labels`, e.g. the label of
+              `pixel_mask` is in `self.model_input_names`).
+            - **mask_labels** -- Optional list of mask labels of shape `(num_class_labels, height, width)` to be fed to
+              a model (when `annotations` are provided).
+            - **class_labels** -- Optional list of class labels of shape `(num_class_labels)` to be fed to a model
+              (when `annotations` are provided). They identify the labels of `mask_labels`, e.g. the label of
               `mask_labels[i][j]` is `class_labels[i][j]`.
         """
         # Input type checking for clearer error
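A quick usage sketch of the per-image mapping introduced above; the toy images, instance ids, and class ids below are made up purely for illustration:

    import numpy as np
    from transformers import MaskFormerFeatureExtractor

    feature_extractor = MaskFormerFeatureExtractor(do_resize=False)

    # Two 3-channel toy images with instance segmentation maps whose pixel
    # values are instance ids, not class ids.
    images = [np.zeros((3, 4, 4), dtype=np.float32) for _ in range(2)]
    segmentation_maps = [
        np.array([[1] * 4] * 2 + [[2] * 4] * 2),  # instances 1 and 2
        np.array([[3] * 4] * 2 + [[4] * 4] * 2),  # instances 3 and 4
    ]

    # One dict per image, mapping that image's instance ids to class ids:
    # instances 1 and 3 are class 7, instances 2 and 4 are class 9.
    inputs = feature_extractor(
        images,
        segmentation_maps=segmentation_maps,
        instance_id_to_semantic_id=[{1: 7, 2: 9}, {3: 7, 4: 9}],
        return_tensors="pt",
    )
    # inputs["mask_labels"]: one (num_instances, height, width) tensor per image
    # inputs["class_labels"]: the mapped class ids per image, e.g. tensor([7, 9])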
@@ -319,26 +323,31 @@ def convert_segmentation_map_to_binary_masks(
         segmentation_map: "np.ndarray",
         instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
     ):
+        # Get unique ids (class or instance ids based on input)
+        all_labels = np.unique(segmentation_map)
+
+        # Drop background label if applicable
         if self.reduce_labels:
-            if self.ignore_index is None:
-                raise ValueError("`ignore_index` must be set when `reduce_labels` is `True`.")
-            segmentation_map[segmentation_map == 0] = self.ignore_index
-            # instances ids start from 1!
-            segmentation_map -= 1
-            segmentation_map[segmentation_map == self.ignore_index - 1] = self.ignore_index
+            all_labels = all_labels[all_labels != 0]
+
+        # Generate a binary mask for each object instance
+        binary_masks = [(segmentation_map == i) for i in all_labels]
+        binary_masks = np.stack(binary_masks, axis=0)  # (num_labels, height, width)
 
+        # Convert instance ids to class ids
         if instance_id_to_semantic_id is not None:
-            # segmentation_map will be treated as an instance segmentation map where each pixel is a instance id
-            # thus it has to be converted to a semantic segmentation map
-            for instance_id, label_id in instance_id_to_semantic_id.items():
-                segmentation_map[segmentation_map == instance_id] = label_id
-        # get all the labels in the image
-        labels = np.unique(segmentation_map)
-        # remove ignore index (if we have one)
-        if self.ignore_index is not None:
-            labels = labels[labels != self.ignore_index]
-        # helping broadcast by making mask [1,W,H] and labels [C, 1, 1]
-        binary_masks = segmentation_map[None] == labels[:, None, None]
+            labels = np.zeros(all_labels.shape[0])
+
+            for label in all_labels:
+                class_id = instance_id_to_semantic_id[label]
+                labels[all_labels == label] = class_id
+        else:
+            labels = all_labels
+
+        # Decrement labels by 1
+        if self.reduce_labels:
+            labels -= 1
+
         return binary_masks.astype(np.float32), labels.astype(np.int64)
 
     def encode_inputs(
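Traced on a toy map, the conversion added above works like this (standalone NumPy that mirrors the new logic rather than calling the method; the ids are arbitrary):

    import numpy as np

    # 2 x 3 instance segmentation map containing two instances, ids 1 and 2
    segmentation_map = np.array([[1, 1, 2], [1, 2, 2]])
    instance_id_to_semantic_id = {1: 7, 2: 9}

    all_labels = np.unique(segmentation_map)  # array([1, 2])

    # One boolean mask per instance id, stacked to (num_labels, height, width)
    binary_masks = np.stack([(segmentation_map == i) for i in all_labels], axis=0)

    # Map each instance id to its class id
    labels = np.array([instance_id_to_semantic_id[i] for i in all_labels])

    print(binary_masks.astype(np.float32).shape)  # (2, 2, 3)
    print(labels)  # [7 9]

The plain `==` comparison yields genuine 0/1 masks per instance, and stacking them produces the `(num_labels, height, width)` layout the model expects.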
@@ -346,7 +355,7 @@ def encode_inputs(
         pixel_values_list: List["np.ndarray"],
         segmentation_maps: ImageInput = None,
         pad_and_return_pixel_mask: bool = True,
-        instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
+        instance_id_to_semantic_id: Optional[Union[List[Dict[int, int]], Dict[int, int]]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
     ):
         """
@@ -374,10 +383,11 @@ def encode_inputs(
                 - 1 for pixels that are real (i.e. **not masked**),
                 - 0 for pixels that are padding (i.e. **masked**).
 
-            instance_id_to_semantic_id (`Dict[int, int]`, *optional*):
-                If passed, we treat `segmentation_maps` as an instance segmentation map where each pixel represents an
-                instance id. To convert it to a binary mask of shape (`batch, num_labels, height, width`) we need a
-                dictionary mapping instance ids to label ids to create a semantic segmentation map.
+            instance_id_to_semantic_id (`List[Dict[int, int]]` or `Dict[int, int]`, *optional*):
+                A mapping between object instance ids and class ids. If passed, `segmentation_maps` is treated as an
+                instance segmentation map where each pixel represents an instance id. Can be provided as a single
+                dictionary with a global/dataset-level mapping or as a list of dictionaries (one per image), to map
+                instance ids in each image separately.
 
             return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor`
@@ -388,7 +398,7 @@ def encode_inputs(
 
             - **pixel_values** -- Pixel values to be fed to a model.
             - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if
-              *"pixel_mask"* is in `self.model_input_names`).
+              `pixel_mask` is in `self.model_input_names`).
             - **mask_labels** -- Optional list of mask labels of shape `(labels, height, width)` to be fed to a model
               (when `annotations` are provided).
             - **class_labels** -- Optional list of class labels of shape `(labels)` to be fed to a model (when
               `annotations` are provided). They identify the labels of `mask_labels`, e.g. the label of
               `mask_labels[i][j]` is `class_labels[i][j]`.
         """
@@ -402,10 +412,17 @@ def encode_inputs(
         if segmentation_maps is not None:
             segmentation_maps = map(np.array, segmentation_maps)
             converted_segmentation_maps = []
-            for segmentation_map in segmentation_maps:
-                converted_segmentation_map = self.convert_segmentation_map_to_binary_masks(
-                    segmentation_map, instance_id_to_semantic_id
-                )
+
+            for i, segmentation_map in enumerate(segmentation_maps):
+                # Use instance2class_id mapping per image
+                if isinstance(instance_id_to_semantic_id, List):
+                    converted_segmentation_map = self.convert_segmentation_map_to_binary_masks(
+                        segmentation_map, instance_id_to_semantic_id[i]
+                    )
+                else:
+                    converted_segmentation_map = self.convert_segmentation_map_to_binary_masks(
+                        segmentation_map, instance_id_to_semantic_id
+                    )
                 converted_segmentation_maps.append(converted_segmentation_map)
 
             annotations = []
@@ -469,7 +486,7 @@ def post_process_segmentation(
 
         Returns:
             `torch.Tensor`:
-                A tensor of shape (`batch_size, num_labels, height, width`).
+                A tensor of shape (`batch_size, num_class_labels, height, width`).
         """
         # class_queries_logits has shape [BATCH, QUERIES, CLASSES + 1]
         class_queries_logits = outputs.class_queries_logits
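For completeness, a minimal inference sketch that exercises the return shape documented in the last hunk (the checkpoint name and the all-black dummy image are illustrative; requires `torch` and `PIL`):

    import numpy as np
    import torch
    from PIL import Image
    from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation

    checkpoint = "facebook/maskformer-swin-base-ade"
    feature_extractor = MaskFormerFeatureExtractor.from_pretrained(checkpoint)
    model = MaskFormerForInstanceSegmentation.from_pretrained(checkpoint)

    image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))
    inputs = feature_extractor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # Per-pixel class scores of shape (batch_size, num_class_labels, height, width)
    segmentation = feature_extractor.post_process_segmentation(outputs)
    print(segmentation.shape)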