From a74eed80fdaf7ddfcd072ff3c61b068862727724 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Sun, 23 Oct 2022 13:04:17 +0200
Subject: [PATCH 01/47] Add first draft

---
 docs/source/en/model_doc/clipseg.mdx          |   60 +
 src/transformers/__init__.py                  |   32 +
 src/transformers/models/__init__.py           |    1 +
 .../models/auto/configuration_auto.py         |    3 +
 .../models/auto/feature_extraction_auto.py    |    1 +
 src/transformers/models/auto/modeling_auto.py |    2 +
 .../models/auto/processing_auto.py            |    1 +
 .../models/auto/tokenization_auto.py          |    6 +
 src/transformers/models/clipseg/__init__.py   |   77 ++
 .../models/clipseg/configuration_clipseg.py   |  395 ++++++
 .../convert_clipseg_original_pytorch_to_hf.py |  148 +++
 .../models/clipseg/modeling_clipseg.py        | 1086 +++++++++++++++++
 tests/models/clipseg/__init__.py              |    0
 tests/models/clipseg/test_modeling_clipseg.py |  674 ++++++++++
 14 files changed, 2486 insertions(+)
 create mode 100644 docs/source/en/model_doc/clipseg.mdx
 create mode 100644 src/transformers/models/clipseg/__init__.py
 create mode 100644 src/transformers/models/clipseg/configuration_clipseg.py
 create mode 100644 src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py
 create mode 100644 src/transformers/models/clipseg/modeling_clipseg.py
 create mode 100644 tests/models/clipseg/__init__.py
 create mode 100644 tests/models/clipseg/test_modeling_clipseg.py

diff --git a/docs/source/en/model_doc/clipseg.mdx b/docs/source/en/model_doc/clipseg.mdx
new file mode 100644
index 0000000000000..942de42d86703
--- /dev/null
+++ b/docs/source/en/model_doc/clipseg.mdx
@@ -0,0 +1,60 @@
+
+
+# CLIPSeg
+
+## Overview
+
+The CLIPSeg model was proposed in [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
+
+The abstract from the paper is the following:
+
+**
+
+Tips:
+
+- CLIPSeg is built on top of a frozen CLIP model: a lightweight decoder turns the CLIP features into a binary segmentation map for a given text or image prompt.
+
+This model was contributed by [nielsr](https://huggingface.co/nielsr).
+The original code can be found [here](https://github.com/timojl/clipseg).
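Below is a minimal usage sketch for the API added in this draft (a sketch, not final documentation). It assumes the placeholder checkpoint name used throughout the patch, `organization/clipseg-rd64-uni`, and reuses CLIP's tokenizer and feature extractor, since this first draft registers `CLIPSegProcessor` in the auto mappings but does not ship a processing module yet.

```python
from PIL import Image
import requests
import torch

from transformers import CLIPFeatureExtractor, CLIPTokenizer, CLIPSegModel

# Placeholder repo id used throughout this draft; the final checkpoint name may differ.
checkpoint = "organization/clipseg-rd64-uni"

model = CLIPSegModel.from_pretrained(checkpoint)
tokenizer = CLIPTokenizer.from_pretrained(checkpoint)
feature_extractor = CLIPFeatureExtractor.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

text_inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
image_inputs = feature_extractor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(
        input_ids=text_inputs.input_ids,
        attention_mask=text_inputs.attention_mask,
        pixel_values=image_inputs.pixel_values,
    )

# CLIP-style image-text similarity scores; the segmentation decoder is not part of this first draft.
probs = outputs.logits_per_image.softmax(dim=-1)
```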
+ + +## CLIPSegConfig + +[[autodoc]] CLIPSegConfig + - from_text_vision_configs + +## CLIPSegTextConfig + +[[autodoc]] CLIPSegTextConfig + +## CLIPSegVisionConfig + +[[autodoc]] CLIPSegVisionConfig + +## CLIPSegModel + +[[autodoc]] CLIPSegModel + - forward + - get_text_features + - get_image_features + +## CLIPSegTextModel + +[[autodoc]] CLIPSegTextModel + - forward + +## CLIPSegVisionModel + +[[autodoc]] CLIPSegVisionModel + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a3ce3fd1eb2ee..8de4f2c9da284 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -171,6 +171,14 @@ "CLIPTokenizer", "CLIPVisionConfig", ], + "models.clipseg": [ + "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP", + "CLIPSegConfig", + + "CLIPSegTextConfig", + + "CLIPSegVisionConfig", + ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDetrConfig"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], @@ -1074,6 +1082,15 @@ "CLIPVisionModel", ] ) + _import_structure["models.clipseg"].extend( + [ + "CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST", + "CLIPSegModel", + "CLIPSegPreTrainedModel", + "CLIPSegTextModel", + "CLIPSegVisionModel", + ] + ) _import_structure["models.x_clip"].extend( [ "XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3225,6 +3242,14 @@ CLIPTokenizer, CLIPVisionConfig, ) + from .models.clipseg import ( + CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLIPSegConfig, + + CLIPSegTextConfig, + + CLIPSegVisionConfig, + ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDetrConfig from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer @@ -3993,6 +4018,13 @@ CLIPTextModel, CLIPVisionModel, ) + from .models.clipseg import ( + CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIPSegModel, + CLIPSegPreTrainedModel, + CLIPSegTextModel, + CLIPSegVisionModel, + ) from .models.codegen import ( CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST, CodeGenForCausalLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 86a775a1eb2b8..03153725125cc 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -37,6 +37,7 @@ camembert, canine, clip, + clipseg, codegen, conditional_detr, convbert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 68f29f89ae50e..0f7a83970bc8d 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -42,6 +42,7 @@ ("camembert", "CamembertConfig"), ("canine", "CanineConfig"), ("clip", "CLIPConfig"), + ("clipseg", "CLIPSegConfig"), ("codegen", "CodeGenConfig"), ("conditional_detr", "ConditionalDetrConfig"), ("convbert", "ConvBertConfig"), @@ -182,6 +183,7 @@ ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("clipseg", "CLIPSegSEG_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("codegen", "CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("conditional_detr", "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convbert", "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -315,6 +317,7 @@ ("camembert", 
"CamemBERT"), ("canine", "CANINE"), ("clip", "CLIP"), + ("clipseg", "CLIPSegSeg"), ("codegen", "CodeGen"), ("conditional_detr", "Conditional DETR"), ("convbert", "ConvBERT"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 76d38f95ab151..ac98d57e4ff8f 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,7 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("clipseg", "CLIPSegFeatureExtractor"), ("conditional_detr", "ConditionalDetrFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 3da1dc1790572..7b6e701175859 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -41,6 +41,7 @@ ("camembert", "CamembertModel"), ("canine", "CanineModel"), ("clip", "CLIPModel"), + ("clipseg", "CLIPSegModel"), ("codegen", "CodeGenModel"), ("conditional_detr", "ConditionalDetrModel"), ("convbert", "ConvBertModel"), @@ -813,6 +814,7 @@ [ # Model for Zero Shot Image Classification mapping ("clip", "CLIPModel"), + ("clipseg", "CLIPSegModel"), ] ) diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 3e31a14d25817..f7bb87e25e2d7 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -40,6 +40,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( [ ("clip", "CLIPProcessor"), + ("clipseg", "CLIPSegProcessor"), ("flava", "FlavaProcessor"), ("groupvit", "CLIPProcessor"), ("layoutlmv2", "LayoutLMv2Processor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 46e57ac58bd4c..65c0c6a5ca509 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -93,6 +93,12 @@ "CLIPTokenizerFast" if is_tokenizers_available() else None, ), ), + ( + "clipseg", + ( + "CLIPTokenizer", + "CLIPTokenizerFast" if is_tokenizers_available() else None, + ), ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), ( diff --git a/src/transformers/models/clipseg/__init__.py b/src/transformers/models/clipseg/__init__.py new file mode 100644 index 0000000000000..2e1f9bd2e6423 --- /dev/null +++ b/src/transformers/models/clipseg/__init__.py @@ -0,0 +1,77 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_clipseg": [ + "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP", + "CLIPSegConfig", + "CLIPSegOnnxConfig", + "CLIPSegTextConfig", + "CLIPSegVisionConfig", + ], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_clipseg"] = [ + "CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST", + "CLIPSegModel", + "CLIPSegPreTrainedModel", + "CLIPSegTextModel", + "CLIPSegVisionModel", + ] + +if TYPE_CHECKING: + from .configuration_clipseg import ( + CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLIPSegConfig, + CLIPSegOnnxConfig, + CLIPSegTextConfig, + CLIPSegVisionConfig, + ) + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_clipseg import ( + CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIPSegModel, + CLIPSegPreTrainedModel, + CLIPSegTextModel, + CLIPSegVisionModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py new file mode 100644 index 0000000000000..0453de57ebf32 --- /dev/null +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -0,0 +1,395 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" CLIPSeg model configuration""" + +import copy +import os +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Mapping, Optional, Union + + +if TYPE_CHECKING: + from ...processing_utils import ProcessorMixin + from ...utils import TensorType + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "organization/clipseg-rd64-uni": "https://huggingface.co/organization/clipseg-rd64-uni/resolve/main/config.json", +} + + + +class CLIPSegTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an CLIPSeg + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the CLIPSeg + [organization/clipseg-rd64-uni](https://huggingface.co/organization/clipseg-rd64-uni) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the CLIPSeg text model. 
Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`CLIPSegModel`]. + hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). 
+ + Example: + + ```python + >>> from transformers import CLIPSegTextConfig, CLIPSegTextModel + + >>> # Initializing a CLIPSegTextConfig with organization/clipseg-rd64-uni style configuration + >>> configuration = CLIPSegTextConfig() + + >>> # Initializing a CLIPSegTextModel (with random weights) from the organization/clipseg-rd64-uni style configuration + >>> model = CLIPSegTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "clipseg_text_model" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPSegConfig + if config_dict.get("model_type") == "clipseg": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLIPSegVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an CLIPSeg + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the CLIPSeg + [organization/clipseg-rd64-uni](https://huggingface.co/organization/clipseg-rd64-uni) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. 
+ patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from transformers import CLIPSegVisionConfig, CLIPSegVisionModel + + >>> # Initializing a CLIPSegVisionConfig with organization/clipseg-rd64-uni style configuration + >>> configuration = CLIPSegVisionConfig() + + >>> # Initializing a CLIPSegVisionModel (with random weights) from the organization/clipseg-rd64-uni style configuration + >>> model = CLIPSegVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "clipseg_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CLIPSegConfig + if config_dict.get("model_type") == "clipseg": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLIPSegConfig(PretrainedConfig): + r""" + [`CLIPSegConfig`] is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate + CLIPSeg model according to the specified arguments, defining the text model and vision model configs. 
+ + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config_dict (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPSegTextConfig`]. + vision_config_dict (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPSegVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimentionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The inital value of the *logit_scale* paramter. Default is used as per the original CLIPSeg implementation. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import CLIPSegConfig, CLIPSegModel + + >>> # Initializing a CLIPSegConfig with organization/clipseg-rd64-uni style configuration + >>> configuration = CLIPSegConfig() + + >>> # Initializing a CLIPSegModel (with random weights) from the organization/clipseg-rd64-uni style configuration + >>> model = CLIPSegModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a CLIPSegConfig from a CLIPSegTextConfig and a CLIPSegVisionConfig + + >>> # Initializing a CLIPSegText and CLIPSegVision configuration + >>> config_text = CLIPSegTextConfig() + >>> config_vision = CLIPSegVisionConfig() + + >>> config = CLIPSegConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "clipseg" + is_composition = True + + def __init__( + self, + text_config_dict=None, + vision_config_dict=None, + projection_dim=512, + logit_scale_init_value=2.6592, + **kwargs + ): + super().__init__(text_config_dict=text_config_dict, vision_config_dict=vision_config_dict, **kwargs) + + if text_config_dict is None: + text_config_dict = {} + logger.info("text_config_dict is None. Initializing the CLIPSegTextConfig with default values.") + + if vision_config_dict is None: + vision_config_dict = {} + logger.info("vision_config_dict is None. initializing the CLIPSegVisionConfig with default values.") + + self.text_config = CLIPSegTextConfig(**text_config_dict) + self.vision_config = CLIPSegVisionConfig(**vision_config_dict) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: CLIPSegTextConfig, vision_config: CLIPSegVisionConfig, **kwargs): + r""" + Instantiate a [`CLIPSegConfig`] (or a derived class) from clipseg text model configuration and clipseg vision model + configuration. + + Returns: + [`CLIPSegConfig`]: An instance of a configuration object + """ + + return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 
+ + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output + + +class CLIPSegOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("logits_per_image", {0: "batch"}), + ("logits_per_text", {0: "batch"}), + ("text_embeds", {0: "batch"}), + ("image_embeds", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + def generate_dummy_inputs( + self, + processor: "ProcessorMixin", + batch_size: int = -1, + seq_length: int = -1, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + + text_input_dict = super().generate_dummy_inputs( + processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework + ) + image_input_dict = super().generate_dummy_inputs( + processor.feature_extractor, batch_size=batch_size, framework=framework + ) + return {**text_input_dict, **image_input_dict} + + @property + def default_onnx_opset(self) -> int: + return 14 diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py new file mode 100644 index 0000000000000..95c9a7650cd63 --- /dev/null +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -0,0 +1,148 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
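To illustrate the composition API defined in `configuration_clipseg.py` above, here is a small sketch that builds a `CLIPSegConfig` from its two sub-configs and round-trips it through `to_dict()`; the asserted values are simply the documented defaults.

```python
from transformers import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig

# Sub-configs with the documented defaults (text: hidden_size=512, vision: hidden_size=768, patch_size=32).
text_config = CLIPSegTextConfig()
vision_config = CLIPSegVisionConfig()

# from_text_vision_configs() nests both sub-configs into a single composite config.
config = CLIPSegConfig.from_text_vision_configs(text_config, vision_config, projection_dim=512)

# to_dict() re-nests the sub-configs so the composite config serializes cleanly.
config_dict = config.to_dict()
assert config_dict["model_type"] == "clipseg"
assert config_dict["text_config"]["hidden_size"] == 512
assert config_dict["vision_config"]["patch_size"] == 32
```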
+ +import argparse + +import torch + +from clipseg import load +from transformers import CLIPSegConfig, CLIPSegModel + + +def copy_attn_layer(hf_attn_layer, pt_attn_layer): + q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0) + q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0) + + out_proj_weights = pt_attn_layer.out_proj.weight + out_proj_bias = pt_attn_layer.out_proj.bias + + hf_attn_layer.q_proj.weight.data = q_proj + hf_attn_layer.q_proj.bias.data = q_proj_bias + + hf_attn_layer.k_proj.weight.data = k_proj + hf_attn_layer.k_proj.bias.data = k_proj_bias + + hf_attn_layer.v_proj.weight.data = v_proj + hf_attn_layer.v_proj.bias.data = v_proj_bias + + hf_attn_layer.out_proj.weight = out_proj_weights + hf_attn_layer.out_proj.bias = out_proj_bias + + +def copy_mlp(hf_mlp, pt_mlp): + copy_linear(hf_mlp.fc1, pt_mlp.c_fc) + copy_linear(hf_mlp.fc2, pt_mlp.c_proj) + + +def copy_linear(hf_linear, pt_linear): + hf_linear.weight = pt_linear.weight + hf_linear.bias = pt_linear.bias + + +def copy_layer(hf_layer, pt_layer): + # copy layer norms + copy_linear(hf_layer.layer_norm1, pt_layer.ln_1) + copy_linear(hf_layer.layer_norm2, pt_layer.ln_2) + + # copy MLP + copy_mlp(hf_layer.mlp, pt_layer.mlp) + + # copy attn + copy_attn_layer(hf_layer.self_attn, pt_layer.attn) + + +def copy_layers(hf_layers, pt_layers): + for hf_layer, pt_layer in zip(hf_layers, pt_layers): + copy_layer(hf_layer, pt_layer) + + +def copy_encoder(hf_encoder, pt_model): + # copy embeds + hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight + hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding + + # copy layer norm + copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final) + + # copy hidden layers + copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks) + + +def copy_text_model_and_projection(hf_model, pt_model): + # copy projection + hf_model.text_projection.weight.data = pt_model.text_projection.data.T + + # copy text encoder + copy_encoder(hf_model.text_model, pt_model) + + +def copy_vison_model_and_projection(hf_model, pt_model): + # copy projection + hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T + + # copy layer norms + copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre) + copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post) + + # copy embeds + hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data + hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding + hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data + + # copy encoder + copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks) + + +@torch.no_grad() +def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): + """ + Copy/paste/tweak model's weights to transformers design. 
+ """ + if config_path is not None: + config = CLIPSegConfig.from_pretrained(config_path) + else: + config = CLIPSegConfig(projection_dim=512, text_config={}, vision_config={}) + + hf_model = CLIPSegModel(config).eval() + + pt_model, _ = load(checkpoint_path, device="cpu", jit=False) + pt_model = pt_model.eval() + + copy_text_model_and_projection(hf_model, pt_model) + copy_vison_model_and_projection(hf_model, pt_model) + hf_model.logit_scale = pt_model.logit_scale + + input_ids = torch.arange(0, 77).unsqueeze(0) + pixel_values = torch.randn(1, 3, 224, 224) + + hf_logits_per_image, hf_logits_per_text = hf_model( + input_ids=input_ids, pixel_values=pixel_values, return_dict=True + )[1:3] + pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) + + assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) + assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3) + + hf_model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") + parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") + args = parser.parse_args() + + convert_clipseg_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py new file mode 100644 index 0000000000000..1e255c1d28ce3 --- /dev/null +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -0,0 +1,1086 @@ +# coding=utf-8 +# Copyright 2022 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch CLIPSeg model.""" + + +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_clipseg import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "organization/clipseg-rd64-uni" + +CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "organization/clipseg-rd64-uni", + # See all CLIPSeg models at https://huggingface.co/models?filter=clipseg +] + + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIPSeg.html +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clipseg +def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLIPSeg +class CLIPSegOutput(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`]. + image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`CLIPSegTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`CLIPSegVisionModel`]. 
+ """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: torch.FloatTensor = None + logits_per_text: torch.FloatTensor = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->CLIPSeg +class CLIPSegVisionEmbeddings(nn.Module): + def __init__(self, config: CLIPSegVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=False + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->CLIPSeg +class CLIPSegTextEmbeddings(nn.Module): + def __init__(self, config: CLIPSegTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->CLIPSeg +class CLIPSegAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: 
{self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->CLIPSeg +class CLIPSegMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->CLIPSeg +class CLIPSegEncoderLayer(nn.Module): + def __init__(self, config: CLIPSegConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLIPSegAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = CLIPSegMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->CLIPSeg,clip->clipseg +class CLIPSegPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = CLIPSegConfig + base_model_prefix = "clipseg" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, CLIPSegTextEmbeddings): + module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, CLIPSegVisionEmbeddings): + factor = self.config.initializer_factor + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, CLIPSegAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, CLIPSegMLP): + factor = self.config.initializer_factor + in_proj_std = ( + (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + ) + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, CLIPSegModel): + nn.init.normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) + nn.init.normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) + + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLIPSegEncoder): + module.gradient_checkpointing = value + + +CLIPSEG_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`CLIPSegConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CLIPSEG_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +CLIPSEG_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +CLIPSEG_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->CLIPSeg +class CLIPSegEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`CLIPSegEncoderLayer`]. + + Args: + config: CLIPSegConfig + """ + + def __init__(self, config: CLIPSegConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([CLIPSegEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + causal_attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class CLIPSegTextTransformer(nn.Module): + def __init__(self, config: CLIPSegTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = CLIPSegTextEmbeddings(config) + self.encoder = CLIPSegEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + bsz, seq_len = input_shape + # CLIPSEG's text model uses causal mask, prepare it here. 
+ # https://github.com/openai/CLIPSEG/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clipseg/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to( + hidden_states.device + ) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=input_ids.device), input_ids.to(torch.int).argmax(dim=-1) + ] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, bsz, seq_len, dtype): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype) + mask.fill_(torch.tensor(torch.finfo(dtype).min)) + mask.triu_(1) # zero out the lower diagonal + mask = mask.unsqueeze(1) # expand mask + return mask + + +class CLIPSegTextModel(CLIPSegPreTrainedModel): + config_class = CLIPSegTextConfig + + _no_split_modules = ["CLIPSegEncoderLayer"] + + def __init__(self, config: CLIPSegTextConfig): + super().__init__(config) + self.text_model = CLIPSegTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import CLIPTokenizer, CLIPSegTextModel + + >>> model = CLIPSegTextModel.from_pretrained("organization/clipseg-rd64-uni") + >>> tokenizer = CLIPTokenizer.from_pretrained("organization/clipseg-rd64-uni") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + return self.text_model( + input_ids=input_ids, + 
attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLIPSegVisionTransformer(nn.Module): + def __init__(self, config: CLIPSegVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLIPSegVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim) + self.encoder = CLIPSegEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim) + + @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class CLIPSegVisionModel(CLIPSegPreTrainedModel): + config_class = CLIPSegVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLIPSegVisionConfig): + super().__init__(config) + self.vision_model = CLIPSegVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, CLIPSegVisionModel + + >>> model = CLIPSegVisionModel.from_pretrained("organization/clipseg-rd64-uni") + >>> processor = CLIPProcessor.from_pretrained("organization/clipseg-rd64-uni") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> 
last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output  # pooled CLS states
+        ```"""
+        return self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+@add_start_docstrings(CLIPSEG_START_DOCSTRING)
+class CLIPSegModel(CLIPSegPreTrainedModel):
+    config_class = CLIPSegConfig
+
+    def __init__(self, config: CLIPSegConfig):
+        super().__init__(config)
+
+        if not isinstance(config.text_config, CLIPSegTextConfig):
+            raise ValueError(
+                "config.text_config is expected to be of type CLIPSegTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, CLIPSegVisionConfig):
+            raise ValueError(
+                "config.vision_config is expected to be of type CLIPSegVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = CLIPSegTextTransformer(text_config)
+        self.vision_model = CLIPSegVisionTransformer(vision_config)
+
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
+    def get_text_features(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+                applying the projection layer to the pooled output of [`CLIPSegTextModel`].
+
+        Examples:
+
+        ```python
+        >>> from transformers import CLIPTokenizer, CLIPSegModel
+
+        >>> model = CLIPSegModel.from_pretrained("organization/clipseg-rd64-uni")
+        >>> tokenizer = CLIPTokenizer.from_pretrained("organization/clipseg-rd64-uni")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+        >>> text_features = model.get_text_features(**inputs)
+        ```"""
+        # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components.
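For context, the `pooled_output` consumed in the body below is the hidden state at the end-of-text position: `CLIPSegTextTransformer` selects it by taking the argmax over the token ids (the EOT token has the highest id in CLIP's vocabulary), and `text_projection` then maps it into the shared `projection_dim` space. A shape-level sketch with made-up sizes and an assumed EOT id:

```python
import torch
from torch import nn

batch_size, seq_len, hidden_size, projection_dim = 2, 7, 512, 512  # made-up sizes
eot_token_id = 49407                                                # assumed to be the largest id in the vocab

last_hidden_state = torch.randn(batch_size, seq_len, hidden_size)
input_ids = torch.randint(0, 1000, (batch_size, seq_len))
input_ids[:, -1] = eot_token_id                                     # pretend every prompt ends with EOT

# pick the hidden state at the EOT position of each sequence, then project it
eot_positions = input_ids.to(torch.int).argmax(dim=-1)
pooled_output = last_hidden_state[torch.arange(batch_size), eot_positions]  # (batch_size, hidden_size)
text_projection = nn.Linear(hidden_size, projection_dim, bias=False)
print(text_projection(pooled_output).shape)                         # torch.Size([2, 512])
```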
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = text_outputs[1]
+        text_features = self.text_projection(pooled_output)
+
+        return text_features
+
+    @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
+    def get_image_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+                applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import CLIPProcessor, CLIPSegModel
+
+        >>> model = CLIPSegModel.from_pretrained("organization/clipseg-rd64-uni")
+        >>> processor = CLIPProcessor.from_pretrained("organization/clipseg-rd64-uni")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> image_features = model.get_image_features(**inputs)
+        ```"""
+        # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components.
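On the vision side, the pooled output used below is simply the first ([CLS]) token of `CLIPSegVisionTransformer`, layer-normalized by `post_layernorm` and then mapped by `visual_projection`. A matching shape-level sketch, again with made-up sizes:

```python
import torch
from torch import nn

batch_size, num_patches, vision_hidden_size, projection_dim = 2, 49, 768, 512     # made-up sizes
last_hidden_state = torch.randn(batch_size, num_patches + 1, vision_hidden_size)  # +1 for the [CLS] token

post_layernorm = nn.LayerNorm(vision_hidden_size)
pooled_output = post_layernorm(last_hidden_state[:, 0, :])       # take the [CLS] token
visual_projection = nn.Linear(vision_hidden_size, projection_dim, bias=False)
print(visual_projection(pooled_output).shape)                    # torch.Size([2, 512])
```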
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @add_start_docstrings_to_model_forward(CLIPSEG_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLIPSegOutput, config_class=CLIPSegConfig) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPSegOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, CLIPSegModel + + >>> model = CLIPSegModel.from_pretrained("organization/clipseg-rd64-uni") + >>> processor = CLIPProcessor.from_pretrained("organization/clipseg-rd64-uni") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components. 
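The forward body that follows mirrors CLIP's contrastive head: both embeddings are L2-normalized, compared with a matrix product, and scaled by the learned temperature `logit_scale`; `clipseg_loss` is presumably the same symmetric cross-entropy used by CLIP, since this module is copied from it. A self-contained sketch of that computation on random tensors, with toy sizes and the standard CLIP-style loss assumed:

```python
import torch
import torch.nn.functional as F

batch_size, projection_dim = 2, 512                               # toy sizes; texts and images come in pairs
text_embeds = torch.randn(batch_size, projection_dim)
image_embeds = torch.randn(batch_size, projection_dim)

# normalize, then cosine similarity scaled by the learned temperature
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
logit_scale = torch.tensor(2.6592).exp()                          # exp(log(1 / 0.07)), the usual CLIP init
logits_per_text = logit_scale * text_embeds @ image_embeds.t()
logits_per_image = logits_per_text.t()

# symmetric cross-entropy: text i should match image i and vice versa
labels = torch.arange(batch_size)
loss = (F.cross_entropy(logits_per_text, labels) + F.cross_entropy(logits_per_image, labels)) / 2.0
print(logits_per_image.softmax(dim=1), loss)
```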
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = clipseg_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return CLIPSegOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) diff --git a/tests/models/clipseg/__init__.py b/tests/models/clipseg/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py new file mode 100644 index 0000000000000..271057c1865c4 --- /dev/null +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -0,0 +1,674 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch CLIPSeg model. 
""" + + +import inspect +import os +import tempfile +import unittest + +import numpy as np + +import requests +import transformers +from transformers import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig +from transformers.testing_utils import ( + is_flax_available, + is_pt_flax_cross_test, + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + random_attention_mask, +) + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import CLIPSegModel, CLIPSegTextModel, CLIPSegVisionModel + from transformers.models.clipseg.modeling_clipseg import CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPProcessor + + +if is_flax_available(): + import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( + convert_pytorch_state_dict_to_flax, + load_flax_weights_in_pytorch_model, + ) + + +class CLIPSegVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return CLIPSegVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = CLIPSegVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = 
self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class CLIPSegVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CLIPSEG does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (CLIPSegVisionModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = CLIPSegVisionModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPSegVisionConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="CLIPSEG does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_training(self): + pass + + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="CLIPSegVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="CLIPSegVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLIPSegVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class CLIPSegTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + 
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return CLIPSegTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = CLIPSegTextModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = (CLIPSegTextModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = CLIPSegTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPSegTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_training(self): + pass + + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="CLIPSEG does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="CLIPSegTextModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="CLIPSegTextModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLIPSegTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class CLIPSegModelTester: + def __init__(self, parent, is_training=True): + self.parent = parent + self.text_model_tester = CLIPSegTextModelTester(parent) + self.vision_model_tester = CLIPSegVisionModelTester(parent) + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + 
return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return CLIPSegConfig.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = CLIPSegModel(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values, attention_mask) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_torch +class CLIPSegModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (CLIPSegModel,) if is_torch_available() else () + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + + def setUp(self): + self.model_tester = CLIPSegModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Inputs_embeds is tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="CLIPSegModel does not have input/output embeddings") + def test_model_common_attributes(self): + pass + + # override as the `logit_scale` parameter initilization is different for CLIPSEG + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initilized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # CLIPSEG needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except 
RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_load_vision_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save CLIPSegConfig and check if we can load CLIPSegVisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = CLIPSegVisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save CLIPSegConfig and check if we can load CLIPSegTextConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + text_config = CLIPSegTextConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) + + # overwrite from common since FlaxCLIPSegModel returns nested output + # which is not supported in the common test + @is_pt_flax_cross_test + def test_equivalence_pt_to_flax(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + + # load PyTorch class + pt_model = model_class(config).eval() + # Flax models don't use the `use_cache` option and cache is not returned as a default. + # So we disable `use_cache` here for PyTorch model. 
+ pt_model.config.use_cache = False + + fx_model_class_name = "Flax" + model_class.__name__ + + if not hasattr(transformers, fx_model_class_name): + return + + fx_model_class = getattr(transformers, fx_model_class_name) + + # load Flax class + fx_model = fx_model_class(config, dtype=jnp.float32) + # make sure only flax inputs are forward that actually exist in function args + fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() + + # prepare inputs + pt_inputs = self._prepare_for_class(inputs_dict, model_class) + + # remove function args that don't exist in Flax + pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} + + fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) + fx_model.params = fx_state + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + # convert inputs to Flax + fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} + fx_outputs = fx_model(**fx_inputs).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + pt_model.save_pretrained(tmpdirname) + fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True) + + fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple() + self.assertEqual( + len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" + ) + for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) + + # overwrite from common since FlaxCLIPSegModel returns nested output + # which is not supported in the common test + @is_pt_flax_cross_test + def test_equivalence_flax_to_pt(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + # load corresponding PyTorch class + pt_model = model_class(config).eval() + + # So we disable `use_cache` here for PyTorch model. 
+ pt_model.config.use_cache = False + + fx_model_class_name = "Flax" + model_class.__name__ + + if not hasattr(transformers, fx_model_class_name): + # no flax model exists for this class + return + + fx_model_class = getattr(transformers, fx_model_class_name) + + # load Flax class + fx_model = fx_model_class(config, dtype=jnp.float32) + # make sure only flax inputs are forward that actually exist in function args + fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() + + pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) + + # make sure weights are tied in PyTorch + pt_model.tie_weights() + + # prepare inputs + pt_inputs = self._prepare_for_class(inputs_dict, model_class) + + # remove function args that don't exist in Flax + pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} + + fx_outputs = fx_model(**fx_inputs).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + fx_model.save_pretrained(tmpdirname) + pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True) + + with torch.no_grad(): + pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() + + self.assertEqual( + len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" + ) + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + @slow + def test_model_from_pretrained(self): + for model_name in CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLIPSegModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +@require_torch +class CLIPSegModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "organization/clipseg-rd64-uni" + model = CLIPSegModel.from_pretrained(model_name).to(torch_device) + processor = CLIPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" + ).to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) + + self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) From b7dd5adfa343cd488c288e4f1c8ede3d643f80c7 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 23 Oct 2022 13:40:01 +0200 Subject: [PATCH 02/47] Update conversion script --- README.md | 1 + README_es.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/clipseg.mdx | 5 + 
docs/source/en/serialization.mdx | 1 + src/transformers/__init__.py | 6 +- src/transformers/models/clipseg/__init__.py | 8 +- .../models/clipseg/configuration_clipseg.py | 25 +- .../convert_clipseg_original_pytorch_to_hf.py | 236 +++++++----------- .../models/clipseg/modeling_clipseg.py | 22 +- src/transformers/models/clipseg/test.py | 6 + src/transformers/utils/dummy_pt_objects.py | 38 +++ tests/models/clipseg/test_modeling_clipseg.py | 15 +- utils/check_repo.py | 3 + 17 files changed, 206 insertions(+), 166 deletions(-) create mode 100644 src/transformers/models/clipseg/test.py diff --git a/README.md b/README.md index 4aa7e6c882ea4..07eec4d8a1b1d 100644 --- a/README.md +++ b/README.md @@ -279,6 +279,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. diff --git a/README_es.md b/README_es.md index c08ec500892d5..579b2da1b7cc7 100644 --- a/README_es.md +++ b/README_es.md @@ -279,6 +279,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. 
**[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. diff --git a/README_ko.md b/README_ko.md index 28a2e2aa46434..30056ff35a6fd 100644 --- a/README_ko.md +++ b/README_ko.md @@ -229,6 +229,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. 
**[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. diff --git a/README_zh-hans.md b/README_zh-hans.md index 7f877c2bed209..891245e343a59 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -253,6 +253,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 +1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index e5764c6ce8f15..b65177be21f7a 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -265,6 +265,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. 
**[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index bcc832a250ded..2a3243787c159 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -67,6 +67,7 @@ The documentation is organized into five sections: 1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSegSeg](model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. 
**[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. @@ -223,6 +224,7 @@ Flax), PyTorch, and/or TensorFlow. | CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| CLIPSegSeg | ❌ | ❌ | ✅ | ❌ | ❌ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | | Conditional DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/clipseg.mdx b/docs/source/en/model_doc/clipseg.mdx index 942de42d86703..5cb283784d2d4 100644 --- a/docs/source/en/model_doc/clipseg.mdx +++ b/docs/source/en/model_doc/clipseg.mdx @@ -58,3 +58,8 @@ The original code can be found [here](). [[autodoc]] CLIPSegVisionModel - forward + +## CLIPSegForImageSegmentation + +[[autodoc]] CLIPSegForImageSegmentation + - forward \ No newline at end of file diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 1cbc1237f286b..14557218b820c 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -56,6 +56,7 @@ Ready-made configurations include the following architectures: - BLOOM - CamemBERT - CLIP +- CLIPSegSeg - CodeGen - Conditional DETR - ConvBERT diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8de4f2c9da284..17932859a6a98 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -174,9 +174,7 @@ "models.clipseg": [ "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPSegConfig", - "CLIPSegTextConfig", - "CLIPSegVisionConfig", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], @@ -1089,6 +1087,7 @@ "CLIPSegPreTrainedModel", "CLIPSegTextModel", "CLIPSegVisionModel", + "CLIPSegForImageSegmentation", ] ) _import_structure["models.x_clip"].extend( @@ -3245,9 +3244,7 @@ from .models.clipseg import ( CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPSegConfig, - CLIPSegTextConfig, - CLIPSegVisionConfig, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer @@ -4020,6 +4017,7 @@ ) from .models.clipseg import ( CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIPSegForImageSegmentation, CLIPSegModel, CLIPSegPreTrainedModel, CLIPSegTextModel, diff --git a/src/transformers/models/clipseg/__init__.py b/src/transformers/models/clipseg/__init__.py index 2e1f9bd2e6423..2e277f1385528 100644 --- a/src/transformers/models/clipseg/__init__.py +++ b/src/transformers/models/clipseg/__init__.py @@ -17,11 +17,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { @@ -46,6 +42,7 @@ "CLIPSegPreTrainedModel", "CLIPSegTextModel", "CLIPSegVisionModel", + "CLIPSegForImageSegmentation", ] if TYPE_CHECKING: @@ -65,6 +62,7 @@ else: from .modeling_clipseg import ( CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIPSegForImageSegmentation, CLIPSegModel, CLIPSegPreTrainedModel, CLIPSegTextModel, diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 0453de57ebf32..b6220f4d69226 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -36,12 +36,11 @@ } - class CLIPSegTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an CLIPSeg - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the CLIPSeg + This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an + CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the CLIPSeg [organization/clipseg-rd64-uni](https://huggingface.co/organization/clipseg-rd64-uni) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -50,8 +49,8 @@ class CLIPSegTextConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 49408): - Vocabulary size of the CLIPSeg text model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`CLIPSegModel`]. + Vocabulary size of the CLIPSeg text model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`CLIPSegModel`]. hidden_size (`int`, *optional*, defaults to 512): Dimensionality of the encoder layers and the pooler layer. intermediate_size (`int`, *optional*, defaults to 2048): @@ -147,9 +146,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], class CLIPSegVisionConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an CLIPSeg - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the CLIPSeg + This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an + CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the CLIPSeg [organization/clipseg-rd64-uni](https://huggingface.co/organization/clipseg-rd64-uni) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the @@ -253,8 +252,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], class CLIPSegConfig(PretrainedConfig): r""" - [`CLIPSegConfig`] is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate - CLIPSeg model according to the specified arguments, defining the text model and vision model configs. + [`CLIPSegConfig`] is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to + instantiate CLIPSeg model according to the specified arguments, defining the text model and vision model configs. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -325,8 +324,8 @@ def __init__( @classmethod def from_text_vision_configs(cls, text_config: CLIPSegTextConfig, vision_config: CLIPSegVisionConfig, **kwargs): r""" - Instantiate a [`CLIPSegConfig`] (or a derived class) from clipseg text model configuration and clipseg vision model - configuration. + Instantiate a [`CLIPSegConfig`] (or a derived class) from clipseg text model configuration and clipseg vision + model configuration. Returns: [`CLIPSegConfig`]: An instance of a configuration object diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 95c9a7650cd63..88425b0d01940 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -1,148 +1,106 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
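Since `from_text_vision_configs` is only described in words in the docstring above, a short usage sketch may help. All values are library defaults except `patch_size`, which is shown purely for illustration:

```python
from transformers import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig

text_config = CLIPSegTextConfig()                   # defaults mirror the CLIP text tower
vision_config = CLIPSegVisionConfig(patch_size=16)  # ViT-B/16-style vision tower, as used later in this series
config = CLIPSegConfig.from_text_vision_configs(text_config, vision_config)
print(config.vision_config.patch_size)              # 16
```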
- import argparse import torch -from clipseg import load -from transformers import CLIPSegConfig, CLIPSegModel - - -def copy_attn_layer(hf_attn_layer, pt_attn_layer): - q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0) - - out_proj_weights = pt_attn_layer.out_proj.weight - out_proj_bias = pt_attn_layer.out_proj.bias - - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias - - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - - hf_attn_layer.out_proj.weight = out_proj_weights - hf_attn_layer.out_proj.bias = out_proj_bias - - -def copy_mlp(hf_mlp, pt_mlp): - copy_linear(hf_mlp.fc1, pt_mlp.c_fc) - copy_linear(hf_mlp.fc2, pt_mlp.c_proj) - - -def copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - -def copy_layer(hf_layer, pt_layer): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_layer.ln_1) - copy_linear(hf_layer.layer_norm2, pt_layer.ln_2) - - # copy MLP - copy_mlp(hf_layer.mlp, pt_layer.mlp) - - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_layer.attn) - - -def copy_layers(hf_layers, pt_layers): - for hf_layer, pt_layer in zip(hf_layers, pt_layers): - copy_layer(hf_layer, pt_layer) - - -def copy_encoder(hf_encoder, pt_model): - # copy embeds - hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight - hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding - - # copy layer norm - copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final) - - # copy hidden layers - copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks) - - -def copy_text_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.text_projection.weight.data = pt_model.text_projection.data.T - - # copy text encoder - copy_encoder(hf_model.text_model, pt_model) - - -def copy_vison_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre) - copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post) - - # copy embeds - hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data - hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks) - - -@torch.no_grad() -def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - if config_path is not None: - config = CLIPSegConfig.from_pretrained(config_path) - else: - config = CLIPSegConfig(projection_dim=512, text_config={}, vision_config={}) - - hf_model = CLIPSegModel(config).eval() - - pt_model, _ = load(checkpoint_path, device="cpu", jit=False) - pt_model = pt_model.eval() - - copy_text_model_and_projection(hf_model, pt_model) - copy_vison_model_and_projection(hf_model, pt_model) - hf_model.logit_scale = pt_model.logit_scale - - input_ids = torch.arange(0, 77).unsqueeze(0) - pixel_values = torch.randn(1, 3, 224, 224) - - hf_logits_per_image, hf_logits_per_text = hf_model( - input_ids=input_ids, pixel_values=pixel_values, return_dict=True - )[1:3] - pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) - - assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) - assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) +from transformers import CLIPSegConfig, CLIPSegForImageSegmentation + + +def get_clipseg_config(): + config = CLIPSegConfig() + return config + + +def rename_key(name): + # update prefixes + if "clip_model" in name: + name = name.replace("clip_model", "clipseg") + if "transformer" in name: + if "visual" in name: + name = name.replace("visual.transformer", "vision_model") + else: + name = name.replace("transformer", "text_model") + if "resblocks" in name: + name = name.replace("resblocks", "encoder.layers") + if "ln_1" in name: + name = name.replace("ln_1", "layer_norm1") + if "ln_2" in name: + name = name.replace("ln_2", "layer_norm2") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + if "ln_final" in name: + name = name.replace("ln_final", "final_layer_norm") + # text encoder + if "token_embedding" in name: + name = name.replace("token_embedding", "text_model.embeddings.token_embedding") + if "positional_embedding" in name: + name = name.replace("positional_embedding", "text_model.embeddings.token_embedding.weight") + # vision encoder + if "visual.class_embedding" in name: + name = name.replace("visual.class_embedding", "vision_model.embeddings.class_embedding") + if "visual.positional_embedding" in name: + name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding") + if "ln_pre" in name: + name = name.replace("ln_pre", "pre_layrnorm") + + return name + + +def convert_state_dict(orig_state_dict, config): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "attn" in key: + # TODO + pass + else: + orig_state_dict[rename_key(key)] = val + + return orig_state_dict + + +def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): + config = get_clipseg_config() + model = CLIPSegForImageSegmentation(config) + model.eval() + + state_dict = torch.load(checkpoint_path, map_location="cpu") + state_dict = convert_state_dict(state_dict, config) + model.load_state_dict(state_dict) + + # TODO assert values + # url = "http://images.cocodataset.org/val2017/000000039769.jpg" + + # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) + # image = Image.open(requests.get(url, stream=True).raw) + # inputs = feature_extractor(images=image, return_tensors="pt") + + # timm_outs = timm_model(inputs["pixel_values"]) + # hf_outs = model(**inputs).logits + + # assert torch.allclose(timm_outs, hf_outs, atol=1e-3) 
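To make the key-renaming rules above concrete, here is how one original checkpoint key would be rewritten. The sample key is hypothetical (it has not been checked against the real `rd64-uni.pth` layout) and only illustrates how the replacement rules compose; the import path assumes the script location added in this patch:

```python
from transformers.models.clipseg.convert_clipseg_original_pytorch_to_hf import rename_key

# "clip_model" -> "clipseg", "transformer" -> "text_model" (no "visual" in the key),
# "resblocks" -> "encoder.layers", "ln_1" -> "layer_norm1"
print(rename_key("clip_model.transformer.resblocks.0.ln_1.weight"))
# -> clipseg.text_model.encoder.layers.0.layer_norm1.weight
```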
+ + if pytorch_dump_folder_path is not None: + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + # print(f"Saving feature extractor to {pytorch_dump_folder_path}") + # feature_extractor.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() + # Required parameters + parser.add_argument( + "--checkpoint_path", + default="/Users/nielsrogge/Downloads/clipseg_weights/rd64-uni.pth", + type=str, + help="Path to the original checkpoint.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) - convert_clipseg_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) + args = parser.parse_args() + convert_clipseg_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 1e255c1d28ce3..f3f690800bba4 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -45,7 +45,6 @@ ] - # Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ @@ -90,7 +89,8 @@ class CLIPSegOutput(ModelOutput): text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`]. image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`]. + The image embeddings obtained by applying the projection layer to the pooled output of + [`CLIPSegVisionModel`]. text_model_output(`BaseModelOutputWithPooling`): The output of the [`CLIPSegTextModel`]. vision_model_output(`BaseModelOutputWithPooling`): @@ -1084,3 +1084,21 @@ def forward( text_model_output=text_outputs, vision_model_output=vision_outputs, ) + + +class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel): + config_class = CLIPSegConfig + + def __init__(self, config: CLIPSegConfig): + super().__init__(config) + + # TODO perhaps use clip here? 
+ self.clipseg = CLIPSegModel(config) + + # TODO: decoder + + # Initialize weights and apply final processing + self.post_init() + + def forward(): + raise NotImplementedError("To do") diff --git a/src/transformers/models/clipseg/test.py b/src/transformers/models/clipseg/test.py new file mode 100644 index 0000000000000..c83fc1611d3bd --- /dev/null +++ b/src/transformers/models/clipseg/test.py @@ -0,0 +1,6 @@ +from transformers import CLIPSegConfig, CLIPSegForImageSegmentation + +model = CLIPSegForImageSegmentation(CLIPSegConfig()) + +for name, param in model.named_parameters(): + print(name, param.shape) \ No newline at end of file diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index cb2f93be0fc90..755f1af0a665c 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1207,6 +1207,44 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class CLIPSegForImageSegmentation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CLIPSegModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CLIPSegPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CLIPSegTextModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CLIPSegVisionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 271057c1865c4..27dfbbb2f5aeb 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -49,7 +49,7 @@ import torch from torch import nn - from transformers import CLIPSegModel, CLIPSegTextModel, CLIPSegVisionModel + from transformers import CLIPSegForImageSegmentation, CLIPSegModel, CLIPSegTextModel, CLIPSegVisionModel from transformers.models.clipseg.modeling_clipseg import CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST @@ -159,7 +159,9 @@ class CLIPSegVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = CLIPSegVisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLIPSegVisionConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester( + self, config_class=CLIPSegVisionConfig, has_text_modality=False, hidden_size=37 + ) def test_config(self): self.config_tester.run_common_tests() @@ -388,7 +390,14 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CLIPSegModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPSegModel,) if is_torch_available() else () + all_model_classes = ( + ( + CLIPSegModel, + CLIPSegForImageSegmentation, + ) + if is_torch_available() + else () + ) fx_compatible = False test_head_masking = False test_pruning = False diff --git a/utils/check_repo.py b/utils/check_repo.py index 4b7ec38e80799..04ebba0dfa4ae 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -140,6 +140,9 @@ # should **not** be the rule. 
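The dummy objects registered above exist so that `from transformers import CLIPSegModel` still succeeds in an environment without PyTorch and only fails, with a helpful message, when the class is actually used. A sketch of that behaviour, assuming PyTorch is not installed:

```python
from transformers import CLIPSegModel  # the import resolves to the dummy object without torch

try:
    CLIPSegModel()   # instantiation is what triggers requires_backends
except ImportError as err:
    print(err)       # the message points the user to a PyTorch installation
```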
IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping + "CLIPSegForImageSegmentation", + "CLIPSegVisionModel", + "CLIPSegTextModel", "EsmForProteinFolding", "TimeSeriesTransformerForPrediction", "PegasusXEncoder", From 8d03901392018291b4dbd428688835f226512452 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 23 Oct 2022 14:53:58 +0200 Subject: [PATCH 03/47] Improve conversion script --- .../convert_clipseg_original_pytorch_to_hf.py | 49 ++++++++++++++++--- src/transformers/models/clipseg/test.py | 3 +- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 88425b0d01940..80e0dc18a1eef 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -25,10 +25,12 @@ def rename_key(name): name = name.replace("ln_1", "layer_norm1") if "ln_2" in name: name = name.replace("ln_2", "layer_norm2") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") + if "c_fc" in name: + name = name.replace("c_fc", "fc1") + if "c_proj" in name: + name = name.replace("c_proj", "fc2") + if "attn" in name: + name = name.replace("attn", "self_attn") if "ln_final" in name: name = name.replace("ln_final", "final_layer_norm") # text encoder @@ -43,6 +45,8 @@ def rename_key(name): name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding") if "ln_pre" in name: name = name.replace("ln_pre", "pre_layrnorm") + if "ln_post" in name: + name = name.replace("ln_post", "post_layernorm") return name @@ -51,9 +55,29 @@ def convert_state_dict(orig_state_dict, config): for key in orig_state_dict.copy().keys(): val = orig_state_dict.pop(key) - if "attn" in key: - # TODO - pass + if key.startswith("clip_model") and "attn.in_proj" in key: + key_split = key.split(".") + if "visual" in key: + layer_num = int(key_split[4]) + dim = config.vision_config.hidden_size + prefix = "vision_model" + else: + layer_num = int(key_split[3]) + dim = config.text_config.hidden_size + prefix = "text_model" + + if "weight" in key: + orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] + orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ + dim : dim * 2, : + ] + orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] + else: + orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] + orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[ + dim : dim * 2 + ] + orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] else: orig_state_dict[rename_key(key)] = val @@ -66,6 +90,15 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): model.eval() state_dict = torch.load(checkpoint_path, map_location="cpu") + + for key in state_dict.copy().keys(): + if key.startswith("model"): + state_dict.pop(key, None) + + print("ORIGINAL STATE DICT") + for name, param in state_dict.items(): + print(name, param.shape) + state_dict = convert_state_dict(state_dict, config) model.load_state_dict(state_dict) @@ -94,7 +127,7 @@ def 
convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): # Required parameters parser.add_argument( "--checkpoint_path", - default="/Users/nielsrogge/Downloads/clipseg_weights/rd64-uni.pth", + default="/Users/nielsrogge/Documents/CLIPSeg/test.pth", type=str, help="Path to the original checkpoint.", ) diff --git a/src/transformers/models/clipseg/test.py b/src/transformers/models/clipseg/test.py index c83fc1611d3bd..810eb39444435 100644 --- a/src/transformers/models/clipseg/test.py +++ b/src/transformers/models/clipseg/test.py @@ -1,6 +1,7 @@ from transformers import CLIPSegConfig, CLIPSegForImageSegmentation + model = CLIPSegForImageSegmentation(CLIPSegConfig()) for name, param in model.named_parameters(): - print(name, param.shape) \ No newline at end of file + print(name, param.shape) From 1053fae56b380016ab5798498e7656cefc501531 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 23 Oct 2022 15:14:30 +0200 Subject: [PATCH 04/47] Improve conversion script some more --- .../convert_clipseg_original_pytorch_to_hf.py | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 80e0dc18a1eef..0637b2da77ce4 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -7,6 +7,7 @@ def get_clipseg_config(): config = CLIPSegConfig() + config.vision_config.patch_size = 16 return config @@ -31,22 +32,29 @@ def rename_key(name): name = name.replace("c_proj", "fc2") if "attn" in name: name = name.replace("attn", "self_attn") - if "ln_final" in name: - name = name.replace("ln_final", "final_layer_norm") # text encoder if "token_embedding" in name: name = name.replace("token_embedding", "text_model.embeddings.token_embedding") - if "positional_embedding" in name: - name = name.replace("positional_embedding", "text_model.embeddings.token_embedding.weight") + if "positional_embedding" in name and "visual" not in name: + name = name.replace("positional_embedding", "text_model.embeddings.position_embedding.weight") + if "ln_final" in name: + name = name.replace("ln_final", "text_model.final_layer_norm") # vision encoder if "visual.class_embedding" in name: name = name.replace("visual.class_embedding", "vision_model.embeddings.class_embedding") + if "visual.conv1" in name: + name = name.replace("visual.conv1", "vision_model.embeddings.patch_embedding") if "visual.positional_embedding" in name: - name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding") - if "ln_pre" in name: - name = name.replace("ln_pre", "pre_layrnorm") - if "ln_post" in name: - name = name.replace("ln_post", "post_layernorm") + name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding.weight") + if "visual.ln_pre" in name: + name = name.replace("visual.ln_pre", "vision_model.pre_layrnorm") + if "visual.ln_post" in name: + name = name.replace("visual.ln_post", "vision_model.post_layernorm") + # projection layers + if "visual.proj" in name: + name = name.replace("visual.proj", "visual_projection.weight") + if "text_projection" in name: + name = name.replace("text_projection", "text_projection.weight") return name @@ -79,7 +87,10 @@ def convert_state_dict(orig_state_dict, config): ] 
orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] else: - orig_state_dict[rename_key(key)] = val + new_name = rename_key(key) + if "visual_projection" in new_name or "text_projection" in new_name: + val = val.T + orig_state_dict[new_name] = val return orig_state_dict From 801be5fbf8b28b97039ddf4705ac0b9fd8ae75c0 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 23 Oct 2022 16:11:04 +0200 Subject: [PATCH 05/47] Add conditional embeddings --- .../convert_clipseg_original_pytorch_to_hf.py | 35 +++++++-- .../models/clipseg/modeling_clipseg.py | 73 ++++++++++++++++++- 2 files changed, 98 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 0637b2da77ce4..a358b6a20523d 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -1,6 +1,8 @@ import argparse import torch +from PIL import Image +from torchvision.transforms import Compose, Resize, ToTensor from transformers import CLIPSegConfig, CLIPSegForImageSegmentation @@ -95,6 +97,14 @@ def convert_state_dict(orig_state_dict, config): return orig_state_dict +image_transforms = Compose( + [ + ToTensor(), + Resize((224, 224)), + ] +) + + def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): config = get_clipseg_config() model = CLIPSegForImageSegmentation(config) @@ -111,19 +121,28 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): print(name, param.shape) state_dict = convert_state_dict(state_dict, config) - model.load_state_dict(state_dict) + missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - # TODO assert values - # url = "http://images.cocodataset.org/val2017/000000039769.jpg" + print("Missing keys:", missing_keys) + print("Unexpected keys:", unexpected_keys) + # TODO create feature extractor # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) - # image = Image.open(requests.get(url, stream=True).raw) - # inputs = feature_extractor(images=image, return_tensors="pt") + image = Image.open("/Users/nielsrogge/Documents/cats.jpg").convert("RGB") + pixel_values = image_transforms(image).unsqueeze(0).repeat(4, 1, 1, 1) + + # prompts = ["a glass", "something to fill", "wood", "a jar"] + # tokenizer = CLIPTokenizer.from_pretrained("openai/") + # input_ids = CLIPTokenizer(prompts, padding="max_length", return_tensors="pt") + input_ids = torch.tensor([[1, 2] + [9] * 75]).repeat(4, 1) + + print("Shape of pixel values:", pixel_values.shape) + print("Shape of input ids:", input_ids.shape) - # timm_outs = timm_model(inputs["pixel_values"]) - # hf_outs = model(**inputs).logits + outputs = model(input_ids, pixel_values) + print(outputs.keys()) - # assert torch.allclose(timm_outs, hf_outs, atol=1e-3) + # assert torch.allclose(outputs, expected_slice, atol=1e-3) if pytorch_dump_folder_path is not None: print(f"Saving model to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index f3f690800bba4..b50fc6eda92bf 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -786,6 +786,9 @@ def forward( hidden_states = self.embeddings(pixel_values) hidden_states = 
self.pre_layrnorm(hidden_states) + print("Shape of hidden states before Transformer encoder:", hidden_states.shape) + print("First values of hidden states before Transformer encoder:", hidden_states[0, :3, :3]) + encoder_outputs = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, @@ -797,6 +800,9 @@ def forward( pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) + print("Shape of pooled output:", pooled_output.shape) + print("First values of pooled output:", pooled_output[0, :3]) + if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] @@ -1094,11 +1100,74 @@ def __init__(self, config: CLIPSegConfig): # TODO perhaps use clip here? self.clipseg = CLIPSegModel(config) + self.extract_layers = [3, 6, 9] # TODO: decoder # Initialize weights and apply final processing self.post_init() - def forward(): - raise NotImplementedError("To do") + def get_conditional_embeddings(self, input_ids, conditional_pixel_values, batch_size): + # conditional can be either in the form of text, an image or an existing embedding tensor + # so either input_ids, pixel_values or existing embeddings + + # # compute conditional from a single string + # if conditional is not None and type(conditional) == str: + # cond = self.compute_conditional(conditional) + # cond = cond.repeat(batch_size, 1) + + # compute conditional from text + if input_ids is not None: + if len(input_ids) != batch_size: + raise ValueError("Make sure to pass as many texts as there are query images") + conditional_embeddings = self.clipseg.get_text_features(input_ids) + # compute conditional from image + elif conditional_pixel_values is not None: + with torch.no_grad(): + conditional_embeddings = self.clipseg.get_image_features(conditional_pixel_values) + # TODO support the use conditional directly + # elif conditional is not None and type(conditional) == torch.Tensor and conditional.ndim == 2: + # cond = conditional + else: + raise ValueError("invalid conditional") + + return conditional_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + conditional_pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPSegOutput]: + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the query images through the frozen CLIP vision encoder + with torch.no_grad(): + vision_outputs = self.clipseg.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + pooled_output = self.clipseg.visual_projection(vision_outputs[1]) + + # we add +1 here as the hidden states also include the initial embeddings + activations = [vision_outputs.hidden_states[i + 1] for i in [0] + self.extract_layers] + + # step 2: compute conditional vector, either from text or images + conditional_embeddings = self.get_conditional_embeddings( + input_ids, conditional_pixel_values, batch_size=pixel_values.shape[0] + ) + + print("Shape of cond:", conditional_embeddings.shape) + print("First values of cond:", conditional_embeddings[0,:3]) + + + return 
vision_outputs From e0007f29332cf2a5ddba399de8cc78fa24cb99be Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 23 Oct 2022 16:23:12 +0200 Subject: [PATCH 06/47] Add initial decoder --- .../models/clipseg/configuration_clipseg.py | 11 ++++++- .../models/clipseg/modeling_clipseg.py | 29 +++++++++++++++++-- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index b6220f4d69226..552ed8e1a9034 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -17,6 +17,7 @@ import copy import os from collections import OrderedDict +from functools import reduce from typing import TYPE_CHECKING, Any, Mapping, Optional, Union @@ -264,9 +265,13 @@ class CLIPSegConfig(PretrainedConfig): vision_config_dict (`dict`, *optional*): Dictionary of configuration options used to initialize [`CLIPSegVisionConfig`]. projection_dim (`int`, *optional*, defaults to 512): - Dimentionality of text and vision projection layers. + Dimensionality of text and vision projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): The inital value of the *logit_scale* paramter. Default is used as per the original CLIPSeg implementation. + extract_layers (`List[int]`, *optional*, defaults to [3, 6, 9]): + Layers to extract when forwarding the query image through the frozen visual backbone of CLIP. + reduce_dim (`int`, *optional*, defaults to 128): + Dimensionality to reduce the CLIP vision embedding. kwargs (*optional*): Dictionary of keyword arguments. @@ -302,6 +307,8 @@ def __init__( vision_config_dict=None, projection_dim=512, logit_scale_init_value=2.6592, + extract_layers=[3, 6, 9], + reduce_dim=128, **kwargs ): super().__init__(text_config_dict=text_config_dict, vision_config_dict=vision_config_dict, **kwargs) @@ -319,6 +326,8 @@ def __init__( self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value + self.reduce_dim = reduce_dim + self.extract_layers = extract_layers self.initializer_factor = 1.0 @classmethod diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index b50fc6eda92bf..b9d9a4a9ffe8a 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1092,6 +1092,29 @@ def forward( ) +class CLIPSegDecoder(CLIPSegPreTrainedModel): + def __init__(self, config: CLIPSegConfig): + super().__init__(config) + + self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim) + self.film_add = nn.Linear(config.projection_dim, config.reduce_dim) + + self.reduce = nn.Linear(config.vision_config.hidden_size, config.reduce_dim) + + self.transposed_convolution = nn.ConvTranspose2d( + config.reduce_dim, 1, config.vision_config.patch_size, stride=config.vision_config.patch_size + ) + + depth = len(config.extract_layers) + self.reduces = nn.ModuleList([nn.Linear(768, config.reduce_dim) for _ in range(depth)]) + # self.blocks = nn.ModuleList( + # [nn.TransformerEncoderLayer(d_model=reduce_dim, nhead=n_heads) for _ in range(len(self.extract_layers))] + # ) + + def forward(self, hidden_states): + return -1 + + class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel): config_class = CLIPSegConfig @@ -1100,9 +1123,10 @@ def __init__(self, config: CLIPSegConfig): # TODO perhaps use clip here? 
self.clipseg = CLIPSegModel(config) - self.extract_layers = [3, 6, 9] + self.extract_layers = config.extract_layers # TODO: decoder + self.decoder = CLIPSegDecoder(config) # Initialize weights and apply final processing self.post_init() @@ -1167,7 +1191,6 @@ def forward( ) print("Shape of cond:", conditional_embeddings.shape) - print("First values of cond:", conditional_embeddings[0,:3]) - + print("First values of cond:", conditional_embeddings[0, :3]) return vision_outputs From ca7f09222155487bd6e0d35f5822e64eb4af91e4 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 18:12:04 +0300 Subject: [PATCH 07/47] Fix activation function of decoder --- .../models/clipseg/configuration_clipseg.py | 29 +++- .../convert_clipseg_original_pytorch_to_hf.py | 37 ++++- .../models/clipseg/modeling_clipseg.py | 155 ++++++++++++++++-- 3 files changed, 202 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 552ed8e1a9034..b532ac5372506 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -17,7 +17,6 @@ import copy import os from collections import OrderedDict -from functools import reduce from typing import TYPE_CHECKING, Any, Mapping, Optional, Union @@ -270,8 +269,20 @@ class CLIPSegConfig(PretrainedConfig): The inital value of the *logit_scale* paramter. Default is used as per the original CLIPSeg implementation. extract_layers (`List[int]`, *optional*, defaults to [3, 6, 9]): Layers to extract when forwarding the query image through the frozen visual backbone of CLIP. - reduce_dim (`int`, *optional*, defaults to 128): + reduce_dim (`int`, *optional*, defaults to 64): Dimensionality to reduce the CLIP vision embedding. + decoder_num_attention_heads (`int`, *optional*, defaults to 4): + Number of attention heads in the decoder of CLIPSeg. + decoder_attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + decoder_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + decoder_intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layers in the Transformer decoder. + conditional_layer (`int`, *optional*, defaults to 0): + ... kwargs (*optional*): Dictionary of keyword arguments. 
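To see why a single transposed convolution with kernel size and stride equal to the patch size is enough to go from the token grid back to pixel space, here is a small shape check. The values assume the 224x224 inputs and patch size 16 used elsewhere in this series and the `reduce_dim=64` documented above; it is purely illustrative:

```python
import torch
from torch import nn

reduce_dim, patch_size, image_size = 64, 16, 224
grid = image_size // patch_size                      # 14 patches per side
trans_conv = nn.ConvTranspose2d(reduce_dim, 1, kernel_size=patch_size, stride=patch_size)

tokens = torch.randn(1, reduce_dim, grid, grid)      # reduced patch tokens, CLS token already removed
print(trans_conv(tokens).shape)                      # torch.Size([1, 1, 224, 224]) -> one logit per pixel
```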
@@ -308,7 +319,12 @@ def __init__( projection_dim=512, logit_scale_init_value=2.6592, extract_layers=[3, 6, 9], - reduce_dim=128, + reduce_dim=64, + decoder_num_attention_heads=4, + decoder_attention_dropout=0.0, + decoder_hidden_act="quick_gelu", + decoder_intermediate_size=2048, + conditional_layer=0, **kwargs ): super().__init__(text_config_dict=text_config_dict, vision_config_dict=vision_config_dict, **kwargs) @@ -326,8 +342,13 @@ def __init__( self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value - self.reduce_dim = reduce_dim self.extract_layers = extract_layers + self.reduce_dim = reduce_dim + self.decoder_num_attention_heads = decoder_num_attention_heads + self.decoder_attention_dropout = decoder_attention_dropout + self.decoder_hidden_act = decoder_hidden_act + self.decoder_intermediate_size = decoder_intermediate_size + self.conditional_layer = conditional_layer self.initializer_factor = 1.0 @classmethod diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index a358b6a20523d..a32a2a98f5c61 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -32,7 +32,7 @@ def rename_key(name): name = name.replace("c_fc", "fc1") if "c_proj" in name: name = name.replace("c_proj", "fc2") - if "attn" in name: + if "attn" in name and "self" not in name: name = name.replace("attn", "self_attn") # text encoder if "token_embedding" in name: @@ -57,6 +57,21 @@ def rename_key(name): name = name.replace("visual.proj", "visual_projection.weight") if "text_projection" in name: name = name.replace("text_projection", "text_projection.weight") + # decoder + if "trans_conv" in name: + name = name.replace("trans_conv", "transposed_convolution") + if "film_mul" in name or "film_add" in name or "reduce" in name or "transposed_convolution" in name: + name = "decoder." 
+ name + if "blocks" in name: + name = name.replace("blocks", "decoder.layers") + if "linear1" in name: + name = name.replace("linear1", "mlp.fc1") + if "linear2" in name: + name = name.replace("linear2", "mlp.fc2") + if "norm1" in name and "layer_" not in name: + name = name.replace("norm1", "layer_norm1") + if "norm2" in name and "layer_" not in name: + name = name.replace("norm2", "layer_norm2") return name @@ -88,6 +103,18 @@ def convert_state_dict(orig_state_dict, config): dim : dim * 2 ] orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] + elif "self_attn" in key and "out_proj" not in key: + key_split = key.split(".") + layer_num = int(key_split[1]) + dim = config.reduce_dim + if "weight" in key: + orig_state_dict[f"decoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] + orig_state_dict[f"decoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[dim : dim * 2, :] + orig_state_dict[f"decoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] + else: + orig_state_dict[f"decoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] + orig_state_dict[f"decoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] + orig_state_dict[f"decoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] else: new_name = rename_key(key) if "visual_projection" in new_name or "text_projection" in new_name: @@ -116,16 +143,16 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): if key.startswith("model"): state_dict.pop(key, None) - print("ORIGINAL STATE DICT") - for name, param in state_dict.items(): - print(name, param.shape) - state_dict = convert_state_dict(state_dict, config) missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) print("Missing keys:", missing_keys) print("Unexpected keys:", unexpected_keys) + print("QUERIES of first decoder block") + print(model.decoder.layers[0].self_attn.q_proj.weight.shape) + print(model.decoder.layers[0].self_attn.q_proj.weight[:3,:3]) + # TODO create feature extractor # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) image = Image.open("/Users/nielsrogge/Documents/cats.jpg").convert("RGB") diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index b9d9a4a9ffe8a..7d6e1b11a64b9 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -14,7 +14,8 @@ # limitations under the License. 
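The `in_proj` handling above is the one subtle part of the conversion: both the CLIP towers and the new decoder blocks store query, key and value as a single fused matrix of shape `(3 * dim, dim)`, and the slicing splits it back into three separate projections. A toy check of that slicing (the dimension is arbitrary):

```python
import torch

dim = 4  # arbitrary toy dimension, only to verify the slicing
in_proj_weight = torch.randn(3 * dim, dim)

q = in_proj_weight[:dim, :]
k = in_proj_weight[dim : dim * 2, :]
v = in_proj_weight[-dim:, :]
assert torch.equal(torch.cat([q, k, v], dim=0), in_proj_weight)
```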
""" PyTorch CLIPSeg model.""" - +import copy +import math from dataclasses import dataclass from typing import Any, Optional, Tuple, Union @@ -207,9 +208,14 @@ def forward( attention_mask: Optional[torch.Tensor] = None, causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, + print_values=False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" + if print_values: + print("Shape of initial queries:", hidden_states.shape) + print("First values of initial queries:", hidden_states[0,:3,:3]) + bsz, tgt_len, embed_dim = hidden_states.size() # get query proj @@ -222,6 +228,11 @@ def forward( key_states = key_states.view(*proj_shape) value_states = value_states.view(*proj_shape) + if print_values: + print("Shape of q:", query_states.shape) + print("First values of q:", query_states[:3,0,:3]) + print("First values of k:", key_states[:3,0,:3]) + src_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) @@ -312,6 +323,7 @@ def forward( attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, output_attentions: Optional[bool] = False, + print_values=False, ) -> Tuple[torch.FloatTensor]: """ Args: @@ -1092,13 +1104,93 @@ def forward( ) +class CLIPSegDecoderLayer(nn.Module): + """ + CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied + after self-attention/MLP, rather than before. + """ + + # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer.__init__ + def __init__(self, config: CLIPSegConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLIPSegAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = CLIPSegMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + print_values=False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + if print_values: + print("Hidden states before self-attention:", hidden_states[0,:3,:3]) + + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + print_values=print_values, + ) + + if print_values: + print("Hidden states after self-attention:", hidden_states[0,:3,:3]) + + hidden_states = residual + hidden_states + hidden_states = self.layer_norm1(hidden_states) + + if print_values: + print("Hidden states after first norm + residual:", hidden_states[0,:3,:3]) + + residual = hidden_states + + if print_values: + print("Hidden states before MLP:", hidden_states[0,:3,:3]) + + hidden_states = self.mlp(hidden_states) + + if print_values: + print("Hidden states after MLP:", hidden_states[0,:3,:3]) + + hidden_states = residual + hidden_states + + hidden_states = self.layer_norm2(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + class CLIPSegDecoder(CLIPSegPreTrainedModel): def __init__(self, config: CLIPSegConfig): super().__init__(config) + self.conditional_layer = config.conditional_layer + self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim) self.film_add = nn.Linear(config.projection_dim, config.reduce_dim) + # TODO remove, this is probably not used self.reduce = nn.Linear(config.vision_config.hidden_size, config.reduce_dim) self.transposed_convolution = nn.ConvTranspose2d( @@ -1106,13 +1198,49 @@ def __init__(self, config: CLIPSegConfig): ) depth = len(config.extract_layers) - self.reduces = nn.ModuleList([nn.Linear(768, config.reduce_dim) for _ in range(depth)]) - # self.blocks = nn.ModuleList( - # [nn.TransformerEncoderLayer(d_model=reduce_dim, nhead=n_heads) for _ in range(len(self.extract_layers))] - # ) + self.reduces = nn.ModuleList( + [nn.Linear(config.vision_config.hidden_size, config.reduce_dim) for _ in range(depth)] + ) - def forward(self, hidden_states): - return -1 + decoder_config = copy.deepcopy(config.vision_config) + decoder_config.hidden_size = config.reduce_dim + decoder_config.num_attention_heads = config.decoder_num_attention_heads + decoder_config.intermediate_size = config.decoder_intermediate_size + decoder_config.hidden_act = "relu" + self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))]) + + def forward(self, hidden_states, conditional_embeddings): + # TODO probably just not include the first hidden states + activations = hidden_states[1:] + _activations = activations[::-1] + + a = None + for i, (activation, layer, reduce) in enumerate(zip(_activations, self.layers, self.reduces)): + if a is not None: + a = reduce(activation) + a + else: + a = reduce(activation) + + if i == self.conditional_layer: + a = self.film_mul(conditional_embeddings) * a.permute(1, 0, 2) + self.film_add(conditional_embeddings) + a = a.permute(1, 0, 2) + + if i == 0: + print(f"Activation before layer {i}:", a[0,:3,:3]) + a = layer(a, attention_mask=None, causal_attention_mask=None, print_values=False)[0] + if i == 0: + print(f"Activation after layer {i}:", a[0,:3,:3]) + + a = a[1:].permute(1, 2, 0) # remove cls token and reshape to [batch_size, features, tokens] + + size = int(math.sqrt(a.shape[2])) + + batch_size = conditional_embeddings.shape[0] + a = a.view(batch_size, a.shape[1], size, size) + + a = self.transposed_convolution(a) + + return a class 
CLIPSegForImageSegmentation(CLIPSegPreTrainedModel): @@ -1125,7 +1253,6 @@ def __init__(self, config: CLIPSegConfig): self.clipseg = CLIPSegModel(config) self.extract_layers = config.extract_layers - # TODO: decoder self.decoder = CLIPSegDecoder(config) # Initialize weights and apply final processing @@ -1185,12 +1312,20 @@ def forward( # we add +1 here as the hidden states also include the initial embeddings activations = [vision_outputs.hidden_states[i + 1] for i in [0] + self.extract_layers] + # for idx, act in enumerate(activations): + # print(f"First values of activations {idx}:", act[0,:3,:3]) + # step 2: compute conditional vector, either from text or images conditional_embeddings = self.get_conditional_embeddings( input_ids, conditional_pixel_values, batch_size=pixel_values.shape[0] ) - print("Shape of cond:", conditional_embeddings.shape) - print("First values of cond:", conditional_embeddings[0, :3]) + # TODO remove these assertions + expected_pooled_output = torch.tensor([0.2551, -0.8039, -0.1766]) + assert torch.allclose(pooled_output[0, :3], expected_pooled_output, atol=1e-3) + expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) + assert torch.allclose(conditional_embeddings[0, :3], expected_cond, atol=1e-3) + + decoder_outputs = self.decoder(activations, conditional_embeddings) return vision_outputs From 346246e07c5540c914c34bdc526a512017a05cbb Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 18:17:11 +0300 Subject: [PATCH 08/47] Make decoder outputs match original implementation --- .../convert_clipseg_original_pytorch_to_hf.py | 2 +- .../models/clipseg/modeling_clipseg.py | 45 +++++++++++-------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index a32a2a98f5c61..17cb75b3c3dd3 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -151,7 +151,7 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): print("QUERIES of first decoder block") print(model.decoder.layers[0].self_attn.q_proj.weight.shape) - print(model.decoder.layers[0].self_attn.q_proj.weight[:3,:3]) + print(model.decoder.layers[0].self_attn.q_proj.weight[:3, :3]) # TODO create feature extractor # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 7d6e1b11a64b9..0a6063ce95b4f 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -214,8 +214,8 @@ def forward( if print_values: print("Shape of initial queries:", hidden_states.shape) - print("First values of initial queries:", hidden_states[0,:3,:3]) - + print("First values of initial queries:", hidden_states[0, :3, :3]) + bsz, tgt_len, embed_dim = hidden_states.size() # get query proj @@ -230,8 +230,8 @@ def forward( if print_values: print("Shape of q:", query_states.shape) - print("First values of q:", query_states[:3,0,:3]) - print("First values of k:", key_states[:3,0,:3]) + print("First values of q:", query_states[:3, 0, :3]) + print("First values of k:", key_states[:3, 0, :3]) src_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) @@ -1106,8 +1106,8 
@@ def forward( class CLIPSegDecoderLayer(nn.Module): """ - CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied - after self-attention/MLP, rather than before. + CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after + self-attention/MLP, rather than before. """ # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer.__init__ @@ -1140,7 +1140,7 @@ def forward( residual = hidden_states if print_values: - print("Hidden states before self-attention:", hidden_states[0,:3,:3]) + print("Hidden states before self-attention:", hidden_states[0, :3, :3]) hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, @@ -1151,23 +1151,23 @@ def forward( ) if print_values: - print("Hidden states after self-attention:", hidden_states[0,:3,:3]) + print("Hidden states after self-attention:", hidden_states[0, :3, :3]) hidden_states = residual + hidden_states hidden_states = self.layer_norm1(hidden_states) if print_values: - print("Hidden states after first norm + residual:", hidden_states[0,:3,:3]) + print("Hidden states after first norm + residual:", hidden_states[0, :3, :3]) residual = hidden_states - + if print_values: - print("Hidden states before MLP:", hidden_states[0,:3,:3]) - + print("Hidden states before MLP:", hidden_states[0, :3, :3]) + hidden_states = self.mlp(hidden_states) if print_values: - print("Hidden states after MLP:", hidden_states[0,:3,:3]) + print("Hidden states after MLP:", hidden_states[0, :3, :3]) hidden_states = residual + hidden_states @@ -1225,13 +1225,14 @@ def forward(self, hidden_states, conditional_embeddings): a = self.film_mul(conditional_embeddings) * a.permute(1, 0, 2) + self.film_add(conditional_embeddings) a = a.permute(1, 0, 2) - if i == 0: - print(f"Activation before layer {i}:", a[0,:3,:3]) + print(f"Activation before layer {i}:", a[0, :3, :3]) a = layer(a, attention_mask=None, causal_attention_mask=None, print_values=False)[0] - if i == 0: - print(f"Activation after layer {i}:", a[0,:3,:3]) - - a = a[1:].permute(1, 2, 0) # remove cls token and reshape to [batch_size, features, tokens] + print(f"Activation after layer {i}:", a[0, :3, :3]) + + # seq, batch, feat + # now it should become batch, feat, seq + + a = a[:, 1:, :].permute(0, 2, 1) # remove cls token and reshape to [batch_size, reduce_dim, seq_len] size = int(math.sqrt(a.shape[2])) @@ -1328,4 +1329,10 @@ def forward( decoder_outputs = self.decoder(activations, conditional_embeddings) + # TODO remove these assertions + expected_decoder_outputs = torch.tensor( + [[-4.2436, -4.2398, -4.2027], [-4.1997, -4.1958, -4.1688], [-4.1144, -4.0943, -4.0736]] + ) + assert torch.allclose(decoder_outputs[0, 0, :3, :3], expected_decoder_outputs, atol=1e-3) + return vision_outputs From 8eac500563d6fe4ffc19ecd61cce17b6fb8cee62 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 18:23:14 +0300 Subject: [PATCH 09/47] Make decoder outputs match original implementation --- src/transformers/models/clipseg/modeling_clipseg.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 0a6063ce95b4f..cb71f9dc6a066 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1277,7 +1277,7 @@ def get_conditional_embeddings(self, input_ids, conditional_pixel_values, batch_ elif conditional_pixel_values 
is not None: with torch.no_grad(): conditional_embeddings = self.clipseg.get_image_features(conditional_pixel_values) - # TODO support the use conditional directly + # TODO support the use of conditional directly # elif conditional is not None and type(conditional) == torch.Tensor and conditional.ndim == 2: # cond = conditional else: @@ -1313,9 +1313,6 @@ def forward( # we add +1 here as the hidden states also include the initial embeddings activations = [vision_outputs.hidden_states[i + 1] for i in [0] + self.extract_layers] - # for idx, act in enumerate(activations): - # print(f"First values of activations {idx}:", act[0,:3,:3]) - # step 2: compute conditional vector, either from text or images conditional_embeddings = self.get_conditional_embeddings( input_ids, conditional_pixel_values, batch_size=pixel_values.shape[0] From 7244954b6d5125d34c67a9a2b5e610354fbdd3f3 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 18:28:40 +0300 Subject: [PATCH 10/47] Add more copied from statements --- .../models/clipseg/modeling_clipseg.py | 28 +++++-------------- utils/check_repo.py | 1 + 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index cb71f9dc6a066..eb855ca7c0750 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -208,14 +208,9 @@ def forward( attention_mask: Optional[torch.Tensor] = None, causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, - print_values=False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" - if print_values: - print("Shape of initial queries:", hidden_states.shape) - print("First values of initial queries:", hidden_states[0, :3, :3]) - bsz, tgt_len, embed_dim = hidden_states.size() # get query proj @@ -228,11 +223,6 @@ def forward( key_states = key_states.view(*proj_shape) value_states = value_states.view(*proj_shape) - if print_values: - print("Shape of q:", query_states.shape) - print("First values of q:", query_states[:3, 0, :3]) - print("First values of k:", key_states[:3, 0, :3]) - src_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) @@ -323,7 +313,6 @@ def forward( attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, output_attentions: Optional[bool] = False, - print_values=False, ) -> Tuple[torch.FloatTensor]: """ Args: @@ -620,6 +609,7 @@ def custom_forward(*inputs): class CLIPSegTextTransformer(nn.Module): + # Copied from transformers.models.clip.modeling_clip.CLIPTextTransformer.__init__ with CLIP->CLIPSeg def __init__(self, config: CLIPSegTextConfig): super().__init__() self.config = config @@ -630,6 +620,7 @@ def __init__(self, config: CLIPSegTextConfig): @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig) + # Copied from transformers.models.clip.modeling_clip.CLIPTextTransformer.forward with clip->clipseg, CLIP->CLIPSeg def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -658,8 +649,8 @@ def forward( hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) bsz, seq_len = input_shape - # CLIPSEG's text model uses causal mask, prepare it here. 
- # https://github.com/openai/CLIPSEG/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clipseg/model.py#L324 + # CLIPSeg's text model uses causal mask, prepare it here. + # https://github.com/openai/CLIPSeg/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clipseg/model.py#L324 causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to( hidden_states.device ) @@ -763,6 +754,7 @@ def forward( class CLIPSegVisionTransformer(nn.Module): + # Copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.__init__ with CLIP->CLIPSeg def __init__(self, config: CLIPSegVisionConfig): super().__init__() self.config = config @@ -775,6 +767,7 @@ def __init__(self, config: CLIPSegVisionConfig): @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig) + # Copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward def forward( self, pixel_values: Optional[torch.FloatTensor] = None, @@ -798,9 +791,6 @@ def forward( hidden_states = self.embeddings(pixel_values) hidden_states = self.pre_layrnorm(hidden_states) - print("Shape of hidden states before Transformer encoder:", hidden_states.shape) - print("First values of hidden states before Transformer encoder:", hidden_states[0, :3, :3]) - encoder_outputs = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, @@ -812,9 +802,6 @@ def forward( pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - print("Shape of pooled output:", pooled_output.shape) - print("First values of pooled output:", pooled_output[0, :3]) - if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] @@ -1110,7 +1097,7 @@ class CLIPSegDecoderLayer(nn.Module): self-attention/MLP, rather than before. """ - # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer.__init__ + # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer.__init__ with CLIP->CLIPSeg def __init__(self, config: CLIPSegConfig): super().__init__() self.embed_dim = config.hidden_size @@ -1147,7 +1134,6 @@ def forward( attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, - print_values=print_values, ) if print_values: diff --git a/utils/check_repo.py b/utils/check_repo.py index 04ebba0dfa4ae..8b02185fa9bd5 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -46,6 +46,7 @@ # Being in this list is an exception and should **not** be the rule. IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ # models to ignore for not tested + "CLIPSegDecoder", # Building part of bigger (tested) model. "TableTransformerEncoder", # Building part of bigger (tested) model. "TableTransformerDecoder", # Building part of bigger (tested) model. "TimeSeriesTransformerEncoder", # Building part of bigger (tested) model. 
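At this point the decoder path is in place: each extracted CLIP activation is linearly reduced, modulated by the conditional embedding via FiLM (`film_mul`/`film_add`) at the configured `conditional_layer`, passed through small transformer layers, and upsampled by the transposed convolution. The following is a compact, self-contained sketch of that dataflow with the shapes used in this series; it is purely illustrative, omits the `CLIPSegDecoderLayer` blocks and the skip connections over extracted layers, and is not the module itself:

```python
import torch
from torch import nn

batch, hidden, reduce_dim, proj_dim, patch, image = 4, 768, 64, 512, 16, 224
grid = image // patch                                 # 14
seq_len = 1 + grid * grid                             # CLS token + 196 patch tokens

activation = torch.randn(batch, seq_len, hidden)      # one extracted vision-encoder hidden state
conditional = torch.randn(batch, proj_dim)            # text or image conditional embedding

reduce = nn.Linear(hidden, reduce_dim)
film_mul, film_add = nn.Linear(proj_dim, reduce_dim), nn.Linear(proj_dim, reduce_dim)
trans_conv = nn.ConvTranspose2d(reduce_dim, 1, kernel_size=patch, stride=patch)

a = reduce(activation)                                                    # (4, 197, 64)
a = film_mul(conditional)[:, None] * a + film_add(conditional)[:, None]   # FiLM conditioning
# ... the CLIPSegDecoderLayer blocks would run here ...
a = a[:, 1:, :].permute(0, 2, 1).reshape(batch, reduce_dim, grid, grid)   # drop CLS, back to a grid
masks = trans_conv(a)                                                     # (4, 1, 224, 224) logits
print(masks.shape)
```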
From 38239eca25905347b482220af9290dac09d0cdd1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 19:20:18 +0300 Subject: [PATCH 11/47] Improve model outputs --- .../convert_clipseg_original_pytorch_to_hf.py | 20 ++-- .../models/clipseg/modeling_clipseg.py | 109 ++++++++++++------ 2 files changed, 83 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 17cb75b3c3dd3..4430b50c82191 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -149,10 +149,6 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): print("Missing keys:", missing_keys) print("Unexpected keys:", unexpected_keys) - print("QUERIES of first decoder block") - print(model.decoder.layers[0].self_attn.q_proj.weight.shape) - print(model.decoder.layers[0].self_attn.q_proj.weight[:3, :3]) - # TODO create feature extractor # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) image = Image.open("/Users/nielsrogge/Documents/cats.jpg").convert("RGB") @@ -163,13 +159,17 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): # input_ids = CLIPTokenizer(prompts, padding="max_length", return_tensors="pt") input_ids = torch.tensor([[1, 2] + [9] * 75]).repeat(4, 1) - print("Shape of pixel values:", pixel_values.shape) - print("Shape of input ids:", input_ids.shape) - - outputs = model(input_ids, pixel_values) - print(outputs.keys()) + with torch.no_grad(): + outputs = model(input_ids, pixel_values) - # assert torch.allclose(outputs, expected_slice, atol=1e-3) + # verify values + expected_masks_slice = torch.tensor( + [[-4.2436, -4.2398, -4.2027], [-4.1997, -4.1958, -4.1688], [-4.1144, -4.0943, -4.0736]] + ) + assert torch.allclose(outputs.predicted_masks[0, 0, :3, :3], expected_masks_slice, atol=1e-3) + expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) + assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_cond, atol=1e-3) + print("Looks ok!") if pytorch_dump_folder_path is not None: print(f"Saving model to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index eb855ca7c0750..fc827ded9bdf1 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -113,6 +113,30 @@ def to_tuple(self) -> Tuple[Any]: ) +@dataclass +class CLIPSegImageSegmentationOutput(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + ... + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`CLIPSegVisionModel`]. 
+ """ + + loss: Optional[torch.FloatTensor] = None + predicted_masks: torch.FloatTensor = None + conditional_embeddings: torch.FloatTensor = None + pooled_output: torch.FloatTensor = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + # Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->CLIPSeg class CLIPSegVisionEmbeddings(nn.Module): def __init__(self, config: CLIPSegVisionConfig): @@ -1211,12 +1235,7 @@ def forward(self, hidden_states, conditional_embeddings): a = self.film_mul(conditional_embeddings) * a.permute(1, 0, 2) + self.film_add(conditional_embeddings) a = a.permute(1, 0, 2) - print(f"Activation before layer {i}:", a[0, :3, :3]) a = layer(a, attention_mask=None, causal_attention_mask=None, print_values=False)[0] - print(f"Activation after layer {i}:", a[0, :3, :3]) - - # seq, batch, feat - # now it should become batch, feat, seq a = a[:, 1:, :].permute(0, 2, 1) # remove cls token and reshape to [batch_size, reduce_dim, seq_len] @@ -1245,40 +1264,41 @@ def __init__(self, config: CLIPSegConfig): # Initialize weights and apply final processing self.post_init() - def get_conditional_embeddings(self, input_ids, conditional_pixel_values, batch_size): - # conditional can be either in the form of text, an image or an existing embedding tensor - # so either input_ids, pixel_values or existing embeddings - - # # compute conditional from a single string - # if conditional is not None and type(conditional) == str: - # cond = self.compute_conditional(conditional) - # cond = cond.repeat(batch_size, 1) - - # compute conditional from text + def get_conditional_embeddings( + self, + input_ids: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + conditional_pixel_values: Optional[torch.Tensor] = None, + batch_size: Optional[int] = None, + ): + # compute conditional embeddings from texts if input_ids is not None: if len(input_ids) != batch_size: raise ValueError("Make sure to pass as many texts as there are query images") - conditional_embeddings = self.clipseg.get_text_features(input_ids) - # compute conditional from image + conditional_embeddings = self.clipseg.get_text_features( + input_ids, attention_mask=attention_mask, position_ids=position_ids + ) + # compute conditional embeddings from images elif conditional_pixel_values is not None: with torch.no_grad(): conditional_embeddings = self.clipseg.get_image_features(conditional_pixel_values) - # TODO support the use of conditional directly - # elif conditional is not None and type(conditional) == torch.Tensor and conditional.ndim == 2: - # cond = conditional else: - raise ValueError("invalid conditional") + raise ValueError( + "Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`" + ) return conditional_embeddings def forward( self, - input_ids: Optional[torch.LongTensor] = None, + input_ids: Optional[torch.FloatTensor] = None, pixel_values: Optional[torch.FloatTensor] = None, conditional_pixel_values: Optional[torch.FloatTensor] = None, + conditional_embeddings: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - return_loss: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = 
None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -1299,23 +1319,40 @@ def forward( # we add +1 here as the hidden states also include the initial embeddings activations = [vision_outputs.hidden_states[i + 1] for i in [0] + self.extract_layers] - # step 2: compute conditional vector, either from text or images - conditional_embeddings = self.get_conditional_embeddings( - input_ids, conditional_pixel_values, batch_size=pixel_values.shape[0] - ) + # step 2: compute conditional embeddings, either from text, images or an own provided embedding + if conditional_embeddings is None: + conditional_embeddings = self.get_conditional_embeddings( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + conditional_pixel_values=conditional_pixel_values, + batch_size=pixel_values.shape[0], + ) + else: + if (not isinstance(conditional_embeddings, torch.Tensor)) or (conditional_embeddings.ndim != 2): + raise ValueError("Make sure to pass conditional embeddings as a two-dimensional tensor") # TODO remove these assertions expected_pooled_output = torch.tensor([0.2551, -0.8039, -0.1766]) assert torch.allclose(pooled_output[0, :3], expected_pooled_output, atol=1e-3) - expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) - assert torch.allclose(conditional_embeddings[0, :3], expected_cond, atol=1e-3) - decoder_outputs = self.decoder(activations, conditional_embeddings) + predicted_masks = self.decoder(activations, conditional_embeddings) - # TODO remove these assertions - expected_decoder_outputs = torch.tensor( - [[-4.2436, -4.2398, -4.2027], [-4.1997, -4.1958, -4.1688], [-4.1144, -4.0943, -4.0736]] - ) - assert torch.allclose(decoder_outputs[0, 0, :3, :3], expected_decoder_outputs, atol=1e-3) + if output_hidden_states: + raise NotImplementedError("To do") + + loss = None + if labels is not None: + loss = loss_fn(predicted_masks, labels) + + if not return_dict: + output = (predicted_masks, conditional_embeddings, pooled_output) + vision_outputs + return ((loss,) + output) if loss is not None else output - return vision_outputs + return CLIPSegImageSegmentationOutput( + loss=loss, + predicted_masks=predicted_masks, + conditional_embeddings=conditional_embeddings, + pooled_output=pooled_output, + vision_model_output=vision_outputs, + ) From 0d94e8fb7eadf5ad89604631c0d138a56e611975 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 19:28:15 +0300 Subject: [PATCH 12/47] Fix auto tokenizer file --- src/transformers/models/auto/tokenization_auto.py | 1 + .../clipseg/convert_clipseg_original_pytorch_to_hf.py | 2 ++ src/transformers/models/clipseg/modeling_clipseg.py | 6 ++---- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 65c0c6a5ca509..d5374e6f42e00 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -99,6 +99,7 @@ "CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None, ), + ), ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), ( diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 4430b50c82191..45a09de5644b3 100644 --- 
a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -169,6 +169,8 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): assert torch.allclose(outputs.predicted_masks[0, 0, :3, :3], expected_masks_slice, atol=1e-3) expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_cond, atol=1e-3) + expected_pooled_output = torch.tensor([0.2551, -0.8039, -0.1766]) + assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3) print("Looks ok!") if pytorch_dump_folder_path is not None: diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index fc827ded9bdf1..02afe1ed75280 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1332,10 +1332,6 @@ def forward( if (not isinstance(conditional_embeddings, torch.Tensor)) or (conditional_embeddings.ndim != 2): raise ValueError("Make sure to pass conditional embeddings as a two-dimensional tensor") - # TODO remove these assertions - expected_pooled_output = torch.tensor([0.2551, -0.8039, -0.1766]) - assert torch.allclose(pooled_output[0, :3], expected_pooled_output, atol=1e-3) - predicted_masks = self.decoder(activations, conditional_embeddings) if output_hidden_states: @@ -1343,6 +1339,8 @@ def forward( loss = None if labels is not None: + # TODO check whether this is correct + loss_fn = nn.BCELoss() loss = loss_fn(predicted_masks, labels) if not return_dict: From bdf01aaca72c3d794fc3dd856f6c5409c0442bc1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 19:38:28 +0300 Subject: [PATCH 13/47] Fix more tests --- .../models/clipseg/modeling_clipseg.py | 3 +- tests/models/clipseg/test_modeling_clipseg.py | 30 +++++++++++++++++-- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 02afe1ed75280..ffeb36b951dcf 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1317,7 +1317,8 @@ def forward( pooled_output = self.clipseg.visual_projection(vision_outputs[1]) # we add +1 here as the hidden states also include the initial embeddings - activations = [vision_outputs.hidden_states[i + 1] for i in [0] + self.extract_layers] + hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2] + activations = [hidden_states[i + 1] for i in [0] + self.extract_layers] # step 2: compute conditional embeddings, either from text, images or an own provided embedding if conditional_embeddings is None: diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 27dfbbb2f5aeb..ace8a9665e55c 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -24,7 +24,8 @@ import requests import transformers -from transformers import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig +from transformers.models.auto import get_values +from transformers import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig, MODEL_MAPPING from transformers.testing_utils import ( is_flax_available, is_pt_flax_cross_test, @@ -362,7 +363,8 @@ def prepare_config_and_inputs(self): def get_config(self): return 
CLIPSegConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64, reduce_dim=32, + extract_layers=[1,2,3], ) def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): @@ -383,7 +385,7 @@ def prepare_config_and_inputs_for_common(self): "input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values, - "return_loss": True, + # "return_loss": True, } return config, inputs_dict @@ -636,6 +638,28 @@ def test_equivalence_flax_to_pt(self): for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]): self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + def test_training(self): + if not self.model_tester.is_training: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class in get_values(MODEL_MAPPING): + continue + + print("Model class:", model_class) + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + for k,v in inputs.items(): + print(k,v.shape) + loss = model(**inputs).loss + loss.backward() + @slow def test_model_from_pretrained(self): for model_name in CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: From cbe5742a52d538f5bcf66cd57ba91392e05359d7 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 19:47:57 +0300 Subject: [PATCH 14/47] Add test --- tests/models/clipseg/test_modeling_clipseg.py | 37 +++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index ace8a9665e55c..2b6bfafaafb3f 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -24,8 +24,8 @@ import requests import transformers +from transformers import MODEL_MAPPING, CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig from transformers.models.auto import get_values -from transformers import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig, MODEL_MAPPING from transformers.testing_utils import ( is_flax_available, is_pt_flax_cross_test, @@ -363,8 +363,11 @@ def prepare_config_and_inputs(self): def get_config(self): return CLIPSegConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64, reduce_dim=32, - extract_layers=[1,2,3], + self.text_model_tester.get_config(), + self.vision_model_tester.get_config(), + projection_dim=64, + reduce_dim=32, + extract_layers=[1, 2, 3], ) def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): @@ -378,6 +381,23 @@ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) ) + def create_and_check_model_for_image_segmentation(self, config, input_ids, attention_maks, pixel_values): + model = CLIPSegForImageSegmentation(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values) + self.parent.assertEqual( + result.predicted_masks.shape, + ( + self.vision_model_tester.batch_size, + 1, + self.vision_model_tester.image_size, + self.vision_model_tester.image_size, + ), + ) + self.parent.assertEqual( + 
result.conditional_embeddings.shape, (self.text_model_tester.batch_size, config.projection_dim) + ) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, input_ids, attention_mask, pixel_values = config_and_inputs @@ -385,7 +405,6 @@ def prepare_config_and_inputs_for_common(self): "input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values, - # "return_loss": True, } return config, inputs_dict @@ -413,6 +432,10 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_model_for_image_segmentation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs) + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass @@ -655,11 +678,11 @@ def test_training(self): model.to(torch_device) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - for k,v in inputs.items(): - print(k,v.shape) + for k, v in inputs.items(): + print(k, v.shape) loss = model(**inputs).loss loss.backward() - + @slow def test_model_from_pretrained(self): for model_name in CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: From 788e8ead7fb9a34d696f0fc4d6feeed1c299c342 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 20:10:07 +0300 Subject: [PATCH 15/47] Improve README and docs, improve conditional embeddings --- README.md | 1 + README_es.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 ++ docs/source/en/index.mdx | 1 + docs/source/en/model_doc/clipseg.mdx | 29 +++++++++++++--- .../models/clipseg/modeling_clipseg.py | 34 ++++++++++++------- 9 files changed, 54 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 07eec4d8a1b1d..42bb91a5a6246 100644 --- a/README.md +++ b/README.md @@ -279,6 +279,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. 1. 
**[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. diff --git a/README_es.md b/README_es.md index 579b2da1b7cc7..fd8b60c4dd3b7 100644 --- a/README_es.md +++ b/README_es.md @@ -279,6 +279,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. 1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. diff --git a/README_ko.md b/README_ko.md index 30056ff35a6fd..abcd1cf6905f9 100644 --- a/README_ko.md +++ b/README_ko.md @@ -229,6 +229,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. 
**[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. 1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. diff --git a/README_zh-hans.md b/README_zh-hans.md index 891245e343a59..8a828b6280e10 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -253,6 +253,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (来自 ) 伴随论文 [Image Segmentation Using Text and Image Prompts]() 由 Timo Lüddecke and Alexander Ecker 发布。 1. 
**[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index b65177be21f7a..4ead564bc5b9d 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -265,6 +265,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. 1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 
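Alongside the README updates, this patch reworks `get_conditional_embeddings` and documents (in the `clipseg.mdx` tips further down) that prompts can be supplied as text, as images, or as precomputed embeddings. The following is a minimal usage sketch under those assumptions, using randomly initialized weights and the same dummy inputs as the conversion script, since the final checkpoint name and feature extractor are not settled yet in this PR:

```python
import torch
from transformers import (
    CLIPSegConfig,
    CLIPSegForImageSegmentation,
    CLIPSegTextConfig,
    CLIPSegVisionConfig,
)

# Build a config the same way the conversion script does (ViT-B/16 vision tower).
config = CLIPSegConfig.from_text_vision_configs(CLIPSegTextConfig(), CLIPSegVisionConfig(patch_size=16))
model = CLIPSegForImageSegmentation(config).eval()

# Four query images and four dummy text prompts of length 77 (same trick as the conversion script).
pixel_values = torch.randn(4, 3, 224, 224)
input_ids = torch.tensor([[1, 2] + [9] * 75]).repeat(4, 1)

with torch.no_grad():
    # Prompting with text: pass `input_ids` next to the query `pixel_values`.
    outputs = model(input_ids=input_ids, pixel_values=pixel_values)

masks = outputs.predicted_masks  # shape (batch_size, 1, height, width)

# Prompting with images or with precomputed embeddings instead of text:
# outputs = model(pixel_values=pixel_values, conditional_pixel_values=prompt_images)
# outputs = model(pixel_values=pixel_values, conditional_embeddings=embeds)  # (batch_size, projection_dim)
```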
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a6706cb774664..1cd287130db48 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -466,6 +466,8 @@ sections: - local: model_doc/clip title: CLIP + - local: model_doc/clipseg + title: CLIPSeg - local: model_doc/data2vec title: Data2Vec - local: model_doc/donut diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 2a3243787c159..87fc718ba5a36 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -67,6 +67,7 @@ The documentation is organized into five sections: 1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSeg](model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. 1. **[CLIPSegSeg](model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. diff --git a/docs/source/en/model_doc/clipseg.mdx b/docs/source/en/model_doc/clipseg.mdx index 5cb283784d2d4..69aa678d16fd2 100644 --- a/docs/source/en/model_doc/clipseg.mdx +++ b/docs/source/en/model_doc/clipseg.mdx @@ -14,18 +14,37 @@ specific language governing permissions and limitations under the License. ## Overview -The CLIPSeg model was proposed in []() by . - +The CLIPSeg model was proposed in [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke +and Alexander Ecker. CLIPSeg adds a minimal decoder on top of a frozen [CLIP](clip) model for zero- and one-shot image segmentation. The abstract from the paper is the following: -** +*Image segmentation is usually addressed by training a +model for a fixed set of object classes. Incorporating additional classes or more complex queries later is expensive +as it requires re-training the model on a dataset that encompasses these expressions. Here we propose a system +that can generate image segmentations based on arbitrary +prompts at test time. A prompt can be either a text or an +image. 
This approach enables us to create a unified model +(trained once) for three common segmentation tasks, which +come with distinct challenges: referring expression segmentation, zero-shot segmentation and one-shot segmentation. +We build upon the CLIP model as a backbone which we extend with a transformer-based decoder that enables dense +prediction. After training on an extended version of the +PhraseCut dataset, our system generates a binary segmentation map for an image based on a free-text prompt or on +an additional image expressing the query. We analyze different variants of the latter image-based prompts in detail. +This novel hybrid input allows for dynamic adaptation not +only to the three segmentation tasks mentioned above, but +to any binary segmentation task where a text or image query +can be formulated. Finally, we find our system to adapt well +to generalized queries involving affordances or properties* Tips: - +- [`CLIPSegForImageSegmentation`] adds the decoder on top of [`CLIPSegModel`]. The latter is identical to [`CLIPModel`]. +- [`CLIPSegForImageSegmentation`] can generate image segmentations based on arbitrary prompts at test time. A prompt can be either a text +(provided to the model as `input_ids`) or an image (provided to the model as `conditional_pixel_values`). One can also provide a custom +conditional embeddings (provided to the model as `conditional_embeddings`). -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](). diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index ffeb36b951dcf..1061bbebf2e59 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1266,21 +1266,24 @@ def __init__(self, config: CLIPSegConfig): def get_conditional_embeddings( self, - input_ids: Optional[torch.Tensor], + batch_size: int = None, + input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, conditional_pixel_values: Optional[torch.Tensor] = None, - batch_size: Optional[int] = None, ): - # compute conditional embeddings from texts if input_ids is not None: + # compute conditional embeddings from texts if len(input_ids) != batch_size: - raise ValueError("Make sure to pass as many texts as there are query images") - conditional_embeddings = self.clipseg.get_text_features( - input_ids, attention_mask=attention_mask, position_ids=position_ids - ) - # compute conditional embeddings from images + raise ValueError("Make sure to pass as many prompt texts as there are query images") + with torch.no_grad(): + conditional_embeddings = self.clipseg.get_text_features( + input_ids, attention_mask=attention_mask, position_ids=position_ids + ) elif conditional_pixel_values is not None: + # compute conditional embeddings from images + if len(conditional_pixel_values) != batch_size: + raise ValueError("Make sure to pass as many prompt images as there are query images") with torch.no_grad(): conditional_embeddings = self.clipseg.get_image_features(conditional_pixel_values) else: @@ -1323,15 +1326,22 @@ def forward( # step 2: compute conditional embeddings, either from text, images or an own provided embedding if conditional_embeddings is None: conditional_embeddings = self.get_conditional_embeddings( - input_ids, + 
batch_size=pixel_values.shape[0], + input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, conditional_pixel_values=conditional_pixel_values, - batch_size=pixel_values.shape[0], ) else: - if (not isinstance(conditional_embeddings, torch.Tensor)) or (conditional_embeddings.ndim != 2): - raise ValueError("Make sure to pass conditional embeddings as a two-dimensional tensor") + if conditional_embeddings.shape[0] != pixel_values.shape[0]: + raise ValueError( + "Make sure to pass as many conditional embeddings as there are query images in the batch" + ) + if conditional_embeddings.shape[1] != self.config.projection_dim: + raise ValueError( + "Make sure that the feature dimension of the conditional embeddings matches" + " `config.projection_dim`." + ) predicted_masks = self.decoder(activations, conditional_embeddings) From 61fab4b053a4523a47bdbc686ec775a77ef135b9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 20:24:54 +0300 Subject: [PATCH 16/47] Fix more tests --- .../models/clipseg/modeling_clipseg.py | 2 ++ tests/models/clipseg/test_modeling_clipseg.py | 15 +++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 1061bbebf2e59..7ff3842d6f19f 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1255,6 +1255,8 @@ class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel): def __init__(self, config: CLIPSegConfig): super().__init__(config) + self.config = config + # TODO perhaps use clip here? self.clipseg = CLIPSegModel(config) self.extract_layers = config.extract_layers diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 2b6bfafaafb3f..ffc64f8a99851 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -148,7 +148,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CLIPSegVisionModelTest(ModelTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as CLIPSEG does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, as CLIPSeg does not use input_ids, inputs_embeds, attention_mask and seq_length. 
""" @@ -167,7 +167,7 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="CLIPSEG does not use inputs_embeds") + @unittest.skip(reason="CLIPSeg does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -327,7 +327,7 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="CLIPSEG does not use inputs_embeds") + @unittest.skip(reason="CLIPSeg does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -452,7 +452,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_common_attributes(self): pass - # override as the `logit_scale` parameter initilization is different for CLIPSEG + # override as the some parameters require custom initialization def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -462,13 +462,16 @@ def test_initialization(self): for name, param in model.named_parameters(): if param.requires_grad: # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": + if "logit_scale" in name: self.assertAlmostEqual( param.data.item(), np.log(1 / 0.07), delta=1e-3, msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + elif "film" in name or "transposed_conv" in name or "reduce" in name: + # those parameters use PyTorch' default initialization scheme + pass else: self.assertIn( ((param.data.mean() * 1e9).round() / 1e9).item(), @@ -490,7 +493,7 @@ def _create_and_check_torchscript(self, config, inputs_dict): try: input_ids = inputs_dict["input_ids"] - pixel_values = inputs_dict["pixel_values"] # CLIPSEG needs pixel_values + pixel_values = inputs_dict["pixel_values"] # CLIPSeg needs pixel_values traced_model = torch.jit.trace(model, (input_ids, pixel_values)) except RuntimeError: self.fail("Couldn't trace module.") From cdbc867b1440add6ef33225e147864c08e650621 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 22:57:48 +0300 Subject: [PATCH 17/47] Remove print statements --- .../models/clipseg/modeling_clipseg.py | 21 +------------------ tests/models/clipseg/test_modeling_clipseg.py | 2 +- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 7ff3842d6f19f..0e6850c0e9f51 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1136,7 +1136,6 @@ def forward( attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, output_attentions: Optional[bool] = False, - print_values=False, ) -> Tuple[torch.FloatTensor]: """ Args: @@ -1150,9 +1149,6 @@ def forward( """ residual = hidden_states - if print_values: - print("Hidden states before self-attention:", hidden_states[0, :3, :3]) - hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, @@ -1160,27 +1156,12 @@ def forward( output_attentions=output_attentions, ) - if print_values: - print("Hidden states after self-attention:", hidden_states[0, :3, :3]) - hidden_states = residual + hidden_states hidden_states = self.layer_norm1(hidden_states) - if print_values: - print("Hidden states after first norm + residual:", hidden_states[0, :3, :3]) - residual = hidden_states - - if print_values: - print("Hidden states before MLP:", hidden_states[0, :3, :3]) - hidden_states = self.mlp(hidden_states) - - if print_values: - print("Hidden 
states after MLP:", hidden_states[0, :3, :3]) - hidden_states = residual + hidden_states - hidden_states = self.layer_norm2(hidden_states) outputs = (hidden_states,) @@ -1235,7 +1216,7 @@ def forward(self, hidden_states, conditional_embeddings): a = self.film_mul(conditional_embeddings) * a.permute(1, 0, 2) + self.film_add(conditional_embeddings) a = a.permute(1, 0, 2) - a = layer(a, attention_mask=None, causal_attention_mask=None, print_values=False)[0] + a = layer(a, attention_mask=None, causal_attention_mask=None)[0] a = a[:, 1:, :].permute(0, 2, 1) # remove cls token and reshape to [batch_size, reduce_dim, seq_len] diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index ffc64f8a99851..5baaa83ef3a56 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -470,7 +470,7 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) elif "film" in name or "transposed_conv" in name or "reduce" in name: - # those parameters use PyTorch' default initialization scheme + # those parameters use PyTorch' default nn.Linear initialization scheme pass else: self.assertIn( From aea0d3b89f53110ca4fbf779cef77542468c9ad6 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Oct 2022 23:01:09 +0300 Subject: [PATCH 18/47] Remove initial embeddings --- src/transformers/models/clipseg/modeling_clipseg.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 0e6850c0e9f51..1659188d244bc 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1201,12 +1201,10 @@ def __init__(self, config: CLIPSegConfig): self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))]) def forward(self, hidden_states, conditional_embeddings): - # TODO probably just not include the first hidden states - activations = hidden_states[1:] - _activations = activations[::-1] + activations = hidden_states[::-1] a = None - for i, (activation, layer, reduce) in enumerate(zip(_activations, self.layers, self.reduces)): + for i, (activation, layer, reduce) in enumerate(zip(activations, self.layers, self.reduces)): if a is not None: a = reduce(activation) + a else: @@ -1237,7 +1235,7 @@ def __init__(self, config: CLIPSegConfig): super().__init__(config) self.config = config - + # TODO perhaps use clip here? 
self.clipseg = CLIPSegModel(config) self.extract_layers = config.extract_layers @@ -1302,9 +1300,9 @@ def forward( ) pooled_output = self.clipseg.visual_projection(vision_outputs[1]) - # we add +1 here as the hidden states also include the initial embeddings hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2] - activations = [hidden_states[i + 1] for i in [0] + self.extract_layers] + # we add +1 here as the hidden states also include the initial embeddings + activations = [hidden_states[i + 1] for i in self.extract_layers] # step 2: compute conditional embeddings, either from text, images or an own provided embedding if conditional_embeddings is None: From d041b96c34e2d8cc17fe182d122827b76613f045 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 31 Oct 2022 19:56:54 +0100 Subject: [PATCH 19/47] Improve conversion script --- .../clipseg/convert_clipseg_original_pytorch_to_hf.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 45a09de5644b3..9fb9573962a20 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -132,7 +132,7 @@ def convert_state_dict(orig_state_dict, config): ) -def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): +def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub): config = get_clipseg_config() model = CLIPSegForImageSegmentation(config) model.eval() @@ -180,6 +180,10 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): # print(f"Saving feature extractor to {pytorch_dump_folder_path}") # feature_extractor.save_pretrained(pytorch_dump_folder_path) + if push_to_hub: + print("Pushing model to the hub") + model.push_to_hub("nielsr/clipseg-test") + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -193,6 +197,9 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path): parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) args = parser.parse_args() - convert_clipseg_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path) + convert_clipseg_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) \ No newline at end of file From f627364ca42169e1708407b18e9cd680d4dd20da Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Nov 2022 11:19:59 +0100 Subject: [PATCH 20/47] Add interpolation of position embeddings --- .../convert_clipseg_original_pytorch_to_hf.py | 11 +++++---- .../models/clipseg/modeling_clipseg.py | 24 +++++++++++++++++-- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 9fb9573962a20..09153c15426e8 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -4,12 +4,13 @@ from PIL import Image from torchvision.transforms import Compose, Resize, ToTensor -from transformers import CLIPSegConfig, CLIPSegForImageSegmentation +from transformers import CLIPSegConfig, CLIPSegForImageSegmentation, CLIPSegTextConfig, CLIPSegVisionConfig def get_clipseg_config(): - config = CLIPSegConfig() - config.vision_config.patch_size = 16 + text_config = CLIPSegTextConfig() + vision_config = CLIPSegVisionConfig(patch_size=16) + config = CLIPSegConfig.from_text_vision_configs(text_config, vision_config) return config @@ -139,10 +140,12 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t state_dict = torch.load(checkpoint_path, map_location="cpu") + # remove some keys for key in state_dict.copy().keys(): if key.startswith("model"): state_dict.pop(key, None) + # rename some keys state_dict = convert_state_dict(state_dict, config) missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) @@ -202,4 +205,4 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t ) args = parser.parse_args() - convert_clipseg_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) \ No newline at end of file + convert_clipseg_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 1659188d244bc..b47985187fa27 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -137,8 +137,8 @@ def to_tuple(self) -> Tuple[Any]: ) -# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->CLIPSeg class CLIPSegVisionEmbeddings(nn.Module): + # Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings.__init__ def __init__(self, config: CLIPSegVisionConfig): super().__init__() self.config = config @@ -157,6 +157,19 @@ def __init__(self, config: CLIPSegVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + def interpolate_position_embeddings(self, new_size): + if len(new_size) != 2: + raise ValueError("new_size should consist of 2 values") + + a = self.position_embedding.weight[1:].T.view(1, self.config.hidden_size, self.num_patches) + b = ( + nn.functional.interpolate(a, new_size, mode="bicubic", align_corners=False) + .squeeze(0) + .view(768, new_size[0] * 
new_size[1]) + .T + ) + return torch.cat([self.model.positional_embedding[:1], b]) + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] @@ -164,7 +177,14 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + + if embeddings.shape[1] != self.num_positions: + new_shape = int(math.sqrt(embeddings.shape[1] - 1)) + embeddings = embeddings + self.interpolate_position_embeddings((new_shape, new_shape)) + embeddings = embeddings.to(embeddings.dtype)[None, :, :] + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings From 22fa35e334625494a17856de91b6efefd5100e2e Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Nov 2022 11:42:49 +0100 Subject: [PATCH 21/47] Finish addition of interpolation of position embeddings --- .../convert_clipseg_original_pytorch_to_hf.py | 29 +++++++++++++++---- .../models/clipseg/modeling_clipseg.py | 14 ++++++--- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 09153c15426e8..d79618e274de6 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -1,3 +1,20 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Convert CLIPSeg checkpoints from the original repository. 
URL: https://github.com/timojl/clipseg.""" + import argparse import torch @@ -128,7 +145,7 @@ def convert_state_dict(orig_state_dict, config): image_transforms = Compose( [ ToTensor(), - Resize((224, 224)), + Resize((352, 352)), ] ) @@ -149,8 +166,10 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t state_dict = convert_state_dict(state_dict, config) missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) + if missing_keys != ["clipseg.text_model.embeddings.position_ids", "clipseg.vision_model.embeddings.position_ids"]: + raise ValueError("Missing keys that are not expected: {}".format(missing_keys)) + if len(unexpected_keys) > 0: + raise ValueError(f"Unexpected keys: {unexpected_keys}") # TODO create feature extractor # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) @@ -167,12 +186,12 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t # verify values expected_masks_slice = torch.tensor( - [[-4.2436, -4.2398, -4.2027], [-4.1997, -4.1958, -4.1688], [-4.1144, -4.0943, -4.0736]] + [[-4.1992, -4.1912, -4.1523], [-4.1509, -4.1442, -4.1091], [-4.0581, -4.0355, -4.0107]] ) assert torch.allclose(outputs.predicted_masks[0, 0, :3, :3], expected_masks_slice, atol=1e-3) expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_cond, atol=1e-3) - expected_pooled_output = torch.tensor([0.2551, -0.8039, -0.1766]) + expected_pooled_output = torch.tensor([0.2208, -0.7577, -0.1391]) assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3) print("Looks ok!") diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index b47985187fa27..e481d4ea7901b 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -161,14 +161,20 @@ def interpolate_position_embeddings(self, new_size): if len(new_size) != 2: raise ValueError("new_size should consist of 2 values") - a = self.position_embedding.weight[1:].T.view(1, self.config.hidden_size, self.num_patches) + num_patches_one_direction = int(self.num_patches**0.5) + # we interpolate the position embeddings in 2D + a = self.position_embedding.weight[1:].T.view( + 1, self.config.hidden_size, num_patches_one_direction, num_patches_one_direction + ) b = ( nn.functional.interpolate(a, new_size, mode="bicubic", align_corners=False) .squeeze(0) - .view(768, new_size[0] * new_size[1]) + .view(self.config.hidden_size, new_size[0] * new_size[1]) .T ) - return torch.cat([self.model.positional_embedding[:1], b]) + result = torch.cat([self.position_embedding.weight[:1], b]) + + return result def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -181,7 +187,7 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: if embeddings.shape[1] != self.num_positions: new_shape = int(math.sqrt(embeddings.shape[1] - 1)) embeddings = embeddings + self.interpolate_position_embeddings((new_shape, new_shape)) - embeddings = embeddings.to(embeddings.dtype)[None, :, :] + embeddings = embeddings.to(embeddings.dtype) else: embeddings = embeddings + self.position_embedding(self.position_ids) From 4f1157177c3467e637bece93dd51d8b3a93deb40 Mon Sep 17 00:00:00 2001 From: Niels Rogge 
Date: Tue, 1 Nov 2022 12:20:36 +0100 Subject: [PATCH 22/47] Add support for refined checkpoint --- .../convert_clipseg_original_pytorch_to_hf.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index d79618e274de6..ae69a11470262 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -185,13 +185,19 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t outputs = model(input_ids, pixel_values) # verify values - expected_masks_slice = torch.tensor( - [[-4.1992, -4.1912, -4.1523], [-4.1509, -4.1442, -4.1091], [-4.0581, -4.0355, -4.0107]] - ) - assert torch.allclose(outputs.predicted_masks[0, 0, :3, :3], expected_masks_slice, atol=1e-3) expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) - assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_cond, atol=1e-3) expected_pooled_output = torch.tensor([0.2208, -0.7577, -0.1391]) + if "refined" in checkpoint_path: + expected_masks_slice = torch.tensor( + [[0.0095, 0.2114, -0.0486], [0.0019, -0.0304, 0.0527], [0.1598, 0.0943, 0.0699]] + ) + else: + expected_masks_slice = torch.tensor( + [[-4.1992, -4.1912, -4.1523], [-4.1509, -4.1442, -4.1091], [-4.0581, -4.0355, -4.0107]] + ) + + assert torch.allclose(outputs.predicted_masks[0, 0, :3, :3], expected_masks_slice, atol=1e-3) + assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_cond, atol=1e-3) assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3) print("Looks ok!") @@ -212,7 +218,7 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t # Required parameters parser.add_argument( "--checkpoint_path", - default="/Users/nielsrogge/Documents/CLIPSeg/test.pth", + default="/Users/nielsrogge/Documents/CLIPSeg/clip_plus_rd64-uni.pth", type=str, help="Path to the original checkpoint.", ) From 9ebe436fbf9ef7c752d95a8022abaab77c53fb06 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Nov 2022 15:16:47 +0100 Subject: [PATCH 23/47] Fix refined checkpoint --- .../models/clipseg/configuration_clipseg.py | 5 ++++ .../convert_clipseg_original_pytorch_to_hf.py | 15 ++++++++---- .../models/clipseg/modeling_clipseg.py | 24 ++++++++++++++++--- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index b532ac5372506..a0f49463059da 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -283,6 +283,9 @@ class CLIPSegConfig(PretrainedConfig): Dimensionality of the "intermediate" (i.e., feed-forward) layers in the Transformer decoder. conditional_layer (`int`, *optional*, defaults to 0): ... + use_complex_transposed_convolution (`bool`, *optional*, defaults to `False`): + Whether to use a more complex transposed convolution in the decoder, enabling more fine-grained + segmentation. kwargs (*optional*): Dictionary of keyword arguments. 
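(A minimal sketch, not part of the diff, of how the new `use_complex_transposed_convolution` flag is expected to be used; it mirrors `get_clipseg_config()` in the conversion script and assumes the classes added by this patch series are importable from `transformers`.)

from transformers import CLIPSegConfig, CLIPSegForImageSegmentation, CLIPSegTextConfig, CLIPSegVisionConfig

# Same setup as the conversion script: default text config, vision tower with patch size 16.
text_config = CLIPSegTextConfig()
vision_config = CLIPSegVisionConfig(patch_size=16)

# The "refined" checkpoint is the one that enables the more complex transposed-convolution decoder.
config = CLIPSegConfig.from_text_vision_configs(
    text_config, vision_config, use_complex_transposed_convolution=True
)

# Randomly initialized here, for illustration only; real weights are loaded by the conversion script.
model = CLIPSegForImageSegmentation(config)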
@@ -325,6 +328,7 @@ def __init__( decoder_hidden_act="quick_gelu", decoder_intermediate_size=2048, conditional_layer=0, + use_complex_transposed_convolution=False, **kwargs ): super().__init__(text_config_dict=text_config_dict, vision_config_dict=vision_config_dict, **kwargs) @@ -350,6 +354,7 @@ def __init__( self.decoder_intermediate_size = decoder_intermediate_size self.conditional_layer = conditional_layer self.initializer_factor = 1.0 + self.use_complex_transposed_convolution = use_complex_transposed_convolution @classmethod def from_text_vision_configs(cls, text_config: CLIPSegTextConfig, vision_config: CLIPSegVisionConfig, **kwargs): diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index ae69a11470262..01dd1d319ddec 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -24,10 +24,15 @@ from transformers import CLIPSegConfig, CLIPSegForImageSegmentation, CLIPSegTextConfig, CLIPSegVisionConfig -def get_clipseg_config(): +def get_clipseg_config(checkpoint_path): text_config = CLIPSegTextConfig() vision_config = CLIPSegVisionConfig(patch_size=16) - config = CLIPSegConfig.from_text_vision_configs(text_config, vision_config) + + use_complex_transposed_convolution = True if "refined" in checkpoint_path else False + + config = CLIPSegConfig.from_text_vision_configs( + text_config, vision_config, use_complex_transposed_convolution=use_complex_transposed_convolution + ) return config @@ -151,7 +156,7 @@ def convert_state_dict(orig_state_dict, config): def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub): - config = get_clipseg_config() + config = get_clipseg_config(checkpoint_path) model = CLIPSegForImageSegmentation(config) model.eval() @@ -184,12 +189,14 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t with torch.no_grad(): outputs = model(input_ids, pixel_values) + print(outputs.predicted_masks[0, 0, :3, :3]) + # verify values expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) expected_pooled_output = torch.tensor([0.2208, -0.7577, -0.1391]) if "refined" in checkpoint_path: expected_masks_slice = torch.tensor( - [[0.0095, 0.2114, -0.0486], [0.0019, -0.0304, 0.0527], [0.1598, 0.0943, 0.0699]] + [[-6.8533, -6.8308, -6.6634], [-6.7272, -6.4926, -6.4597], [-6.4338, -6.2161, -6.2296]] ) else: expected_masks_slice = torch.tensor( diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index e481d4ea7901b..3c403e2cdb177 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1210,9 +1210,27 @@ def __init__(self, config: CLIPSegConfig): # TODO remove, this is probably not used self.reduce = nn.Linear(config.vision_config.hidden_size, config.reduce_dim) - self.transposed_convolution = nn.ConvTranspose2d( - config.reduce_dim, 1, config.vision_config.patch_size, stride=config.vision_config.patch_size - ) + if config.use_complex_transposed_convolution: + transposed_kernels = (config.vision_config.patch_size // 4, config.vision_config.patch_size // 4) + + self.transposed_convolution = nn.Sequential( + nn.Conv2d(config.reduce_dim, config.reduce_dim, kernel_size=3, padding=1), + nn.ReLU(), + nn.ConvTranspose2d( + config.reduce_dim, + config.reduce_dim // 2, + 
kernel_size=transposed_kernels[0], + stride=transposed_kernels[0], + ), + nn.ReLU(), + nn.ConvTranspose2d( + config.reduce_dim // 2, 1, kernel_size=transposed_kernels[1], stride=transposed_kernels[1] + ), + ) + else: + self.transposed_convolution = nn.ConvTranspose2d( + config.reduce_dim, 1, config.vision_config.patch_size, stride=config.vision_config.patch_size + ) depth = len(config.extract_layers) self.reduces = nn.ModuleList( From b68955559a49615fe2402fe3fa04f4c44d5ad13a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Nov 2022 15:19:35 +0100 Subject: [PATCH 24/47] Remove unused parameter --- .../models/clipseg/convert_clipseg_original_pytorch_to_hf.py | 2 +- src/transformers/models/clipseg/modeling_clipseg.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 01dd1d319ddec..21a42697489ea 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -173,7 +173,7 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t if missing_keys != ["clipseg.text_model.embeddings.position_ids", "clipseg.vision_model.embeddings.position_ids"]: raise ValueError("Missing keys that are not expected: {}".format(missing_keys)) - if len(unexpected_keys) > 0: + if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]: raise ValueError(f"Unexpected keys: {unexpected_keys}") # TODO create feature extractor diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 3c403e2cdb177..33ff255d65966 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1207,9 +1207,6 @@ def __init__(self, config: CLIPSegConfig): self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim) self.film_add = nn.Linear(config.projection_dim, config.reduce_dim) - # TODO remove, this is probably not used - self.reduce = nn.Linear(config.vision_config.hidden_size, config.reduce_dim) - if config.use_complex_transposed_convolution: transposed_kernels = (config.vision_config.patch_size // 4, config.vision_config.patch_size // 4) From fae27ad708a541201980f0e8b0c84578ca28efa1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Nov 2022 15:35:33 +0100 Subject: [PATCH 25/47] Improve conversion script --- .../convert_clipseg_original_pytorch_to_hf.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 21a42697489ea..42826ceb7b0be 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -24,11 +24,11 @@ from transformers import CLIPSegConfig, CLIPSegForImageSegmentation, CLIPSegTextConfig, CLIPSegVisionConfig -def get_clipseg_config(checkpoint_path): +def get_clipseg_config(model_name): text_config = CLIPSegTextConfig() vision_config = CLIPSegVisionConfig(patch_size=16) - use_complex_transposed_convolution = True if "refined" in checkpoint_path else False + use_complex_transposed_convolution = True if "refined" in model_name else False config = 
CLIPSegConfig.from_text_vision_configs( text_config, vision_config, use_complex_transposed_convolution=use_complex_transposed_convolution @@ -155,8 +155,8 @@ def convert_state_dict(orig_state_dict, config): ) -def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub): - config = get_clipseg_config(checkpoint_path) +def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub): + config = get_clipseg_config(model_name) model = CLIPSegForImageSegmentation(config) model.eval() @@ -194,7 +194,7 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t # verify values expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) expected_pooled_output = torch.tensor([0.2208, -0.7577, -0.1391]) - if "refined" in checkpoint_path: + if "refined" in model_name: expected_masks_slice = torch.tensor( [[-6.8533, -6.8308, -6.6634], [-6.7272, -6.4926, -6.4597], [-6.4338, -6.2161, -6.2296]] ) @@ -216,18 +216,25 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t # feature_extractor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - print("Pushing model to the hub") - model.push_to_hub("nielsr/clipseg-test") + print(f"Pushing model {model_name} to the hub") + model.push_to_hub(f"nielsr/{model_name}") if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters + parser.add_argument( + "--model_name", + default="clipseg", + type=str, + choices=['clipseg', 'clipseg-rd16', 'clipseg-rd64-refined'], + help="Name of the model. Supported models are: clipseg-rd64, clipseg-rd16 and clipseg-rd64-refined (rd meaning reduce dimension)", + ) parser.add_argument( "--checkpoint_path", default="/Users/nielsrogge/Documents/CLIPSeg/clip_plus_rd64-uni.pth", type=str, - help="Path to the original checkpoint.", + help="Path to the original checkpoint. Note that the script assumes that the checkpoint includes both CLIP and the decoder weights.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." @@ -237,4 +244,4 @@ def convert_clipseg_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_t ) args = parser.parse_args() - convert_clipseg_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) + convert_clipseg_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) \ No newline at end of file From 174d631a8dba450dfbf24aeb57bc373e0bb0c149 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Nov 2022 16:42:46 +0100 Subject: [PATCH 26/47] Add support for training --- .../convert_clipseg_original_pytorch_to_hf.py | 14 ++++++--- .../models/clipseg/modeling_clipseg.py | 5 ++-- tests/models/clipseg/test_modeling_clipseg.py | 29 ++++++++++++------- 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 42826ceb7b0be..0cbf7ac74c955 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -227,14 +227,20 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ "--model_name", default="clipseg", type=str, - choices=['clipseg', 'clipseg-rd16', 'clipseg-rd64-refined'], - help="Name of the model. 
Supported models are: clipseg-rd64, clipseg-rd16 and clipseg-rd64-refined (rd meaning reduce dimension)", + choices=["clipseg", "clipseg-rd16", "clipseg-rd64-refined"], + help=( + "Name of the model. Supported models are: clipseg-rd64, clipseg-rd16 and clipseg-rd64-refined (rd meaning" + " reduce dimension)" + ), ) parser.add_argument( "--checkpoint_path", default="/Users/nielsrogge/Documents/CLIPSeg/clip_plus_rd64-uni.pth", type=str, - help="Path to the original checkpoint. Note that the script assumes that the checkpoint includes both CLIP and the decoder weights.", + help=( + "Path to the original checkpoint. Note that the script assumes that the checkpoint includes both CLIP and" + " the decoder weights." + ), ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." @@ -244,4 +250,4 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ ) args = parser.parse_args() - convert_clipseg_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) \ No newline at end of file + convert_clipseg_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 33ff255d65966..823f1407548f9 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1365,15 +1365,14 @@ def forward( " `config.projection_dim`." ) - predicted_masks = self.decoder(activations, conditional_embeddings) + predicted_masks = self.decoder(activations, conditional_embeddings).squeeze() if output_hidden_states: raise NotImplementedError("To do") loss = None if labels is not None: - # TODO check whether this is correct - loss_fn = nn.BCELoss() + loss_fn = nn.BCEWithLogitsLoss() loss = loss_fn(predicted_masks, labels) if not return_dict: diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 5baaa83ef3a56..46b21320ade10 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -389,7 +389,6 @@ def create_and_check_model_for_image_segmentation(self, config, input_ids, atten result.predicted_masks.shape, ( self.vision_model_tester.batch_size, - 1, self.vision_model_tester.image_size, self.vision_model_tester.image_size, ), @@ -425,6 +424,17 @@ class CLIPSegModelTest(ModelTesterMixin, unittest.TestCase): test_resize_embeddings = False test_attention_outputs = False + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + # CLIPSegForImageSegmentation requires special treatment + if return_labels: + if model_class.__name__ == "CLIPSegForImageSegmentation": + batch_size, _, height, width = inputs_dict["pixel_values"].shape + inputs_dict["labels"] = torch.zeros( + [batch_size, height, width], device=torch_device, dtype=torch.float + ) + + return inputs_dict + def setUp(self): self.model_tester = CLIPSegModelTester(self) @@ -704,10 +714,11 @@ def prepare_img(): @require_torch class CLIPSegModelIntegrationTest(unittest.TestCase): @slow - def test_inference(self): - model_name = "organization/clipseg-rd64-uni" - model = CLIPSegModel.from_pretrained(model_name).to(torch_device) + def test_inference_image_segmentation(self): + # TODO update to appropriate organization + model_name = "nielsr/clipseg-rd64-refined" processor = 
CLIPProcessor.from_pretrained(model_name) + model = CLIPSegModel.from_pretrained(model_name).to(torch_device) image = prepare_img() inputs = processor( @@ -718,16 +729,12 @@ def test_inference(self): with torch.no_grad(): outputs = model(**inputs) - # verify the logits + # verify the predicted masks self.assertEqual( - outputs.logits_per_image.shape, + outputs.predicted_masks.shape, torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), ) - self.assertEqual( - outputs.logits_per_text.shape, - torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), - ) expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) - self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) + self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) \ No newline at end of file From 23d78a767b205263e9e32f1b6ebb80fb67dd6451 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Nov 2022 16:46:12 +0100 Subject: [PATCH 27/47] Fix conversion script --- .../models/clipseg/convert_clipseg_original_pytorch_to_hf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 0cbf7ac74c955..e40e1efea7b1b 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -189,8 +189,6 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ with torch.no_grad(): outputs = model(input_ids, pixel_values) - print(outputs.predicted_masks[0, 0, :3, :3]) - # verify values expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) expected_pooled_output = torch.tensor([0.2208, -0.7577, -0.1391]) @@ -203,7 +201,7 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ [[-4.1992, -4.1912, -4.1523], [-4.1509, -4.1442, -4.1091], [-4.0581, -4.0355, -4.0107]] ) - assert torch.allclose(outputs.predicted_masks[0, 0, :3, :3], expected_masks_slice, atol=1e-3) + assert torch.allclose(outputs.predicted_masks[0, :3, :3], expected_masks_slice, atol=1e-3) assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_cond, atol=1e-3) assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3) print("Looks ok!") From 207ab72006d9a1d2cff967ab35d55098ee141d58 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Nov 2022 18:58:24 +0100 Subject: [PATCH 28/47] Add CLIPSegFeatureExtractor --- docs/source/en/model_doc/clipseg.mdx | 10 +- src/transformers/__init__.py | 2 + src/transformers/models/clipseg/__init__.py | 21 +- .../clipseg/feature_extraction_clipseg.py | 25 ++ .../clipseg/image_processing_clipseg.py | 279 ++++++++++++++++++ .../models/clipseg/processing_clipseg.py | 107 +++++++ .../utils/dummy_vision_objects.py | 7 + tests/models/clipseg/test_modeling_clipseg.py | 2 +- 8 files changed, 446 insertions(+), 7 deletions(-) create mode 100644 src/transformers/models/clipseg/feature_extraction_clipseg.py create mode 100644 src/transformers/models/clipseg/image_processing_clipseg.py create mode 100644 src/transformers/models/clipseg/processing_clipseg.py diff --git a/docs/source/en/model_doc/clipseg.mdx b/docs/source/en/model_doc/clipseg.mdx index 69aa678d16fd2..5f58e15268007 100644 --- a/docs/source/en/model_doc/clipseg.mdx +++ b/docs/source/en/model_doc/clipseg.mdx @@ -39,13 +39,13 @@ to 
generalized queries involving affordances or properties* Tips: -- [`CLIPSegForImageSegmentation`] adds the decoder on top of [`CLIPSegModel`]. The latter is identical to [`CLIPModel`]. +- [`CLIPSegForImageSegmentation`] adds a decoder on top of [`CLIPSegModel`]. The latter is identical to [`CLIPModel`]. - [`CLIPSegForImageSegmentation`] can generate image segmentations based on arbitrary prompts at test time. A prompt can be either a text -(provided to the model as `input_ids`) or an image (provided to the model as `conditional_pixel_values`). One can also provide a custom +(provided to the model as `input_ids`) or an image (provided to the model as `conditional_pixel_values`). One can also provide custom conditional embeddings (provided to the model as `conditional_embeddings`). This model was contributed by [nielsr](https://huggingface.co/nielsr). -The original code can be found [here](). +The original code can be found [here](https://github.com/timojl/clipseg). ## CLIPSegConfig @@ -61,6 +61,10 @@ The original code can be found [here](). [[autodoc]] CLIPSegVisionConfig +## CLIPSegFeatureExtractor + +[[autodoc]] CLIPSegFeatureExtractor + ## CLIPSegModel [[autodoc]] CLIPSegModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 17932859a6a98..2e23b32bdabca 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -692,6 +692,7 @@ _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") _import_structure["models.clip"].append("CLIPFeatureExtractor") + _import_structure["models.clipseg"].append("CLIPSegFeatureExtractor") _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deformable_detr"].append("DeformableDetrFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") @@ -3700,6 +3701,7 @@ from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor + from .models.clipseg import CLIPSegFeatureExtractor from .models.conditional_detr import ConditionalDetrFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deformable_detr import DeformableDetrFeatureExtractor diff --git a/src/transformers/models/clipseg/__init__.py b/src/transformers/models/clipseg/__init__.py index 2e277f1385528..97c641b7c0074 100644 --- a/src/transformers/models/clipseg/__init__.py +++ b/src/transformers/models/clipseg/__init__.py @@ -17,14 +17,13 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available _import_structure = { "configuration_clipseg": [ "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPSegConfig", - "CLIPSegOnnxConfig", "CLIPSegTextConfig", "CLIPSegVisionConfig", ], @@ -45,11 +44,18 @@ "CLIPSegForImageSegmentation", ] +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_clipseg"] = ["CLIPSegFeatureExtractor"] + if TYPE_CHECKING: from .configuration_clipseg import ( CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPSegConfig, - CLIPSegOnnxConfig, CLIPSegTextConfig, CLIPSegVisionConfig, ) @@ -69,6 +75,15 @@ CLIPSegVisionModel, ) + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_clipseg import CLIPSegFeatureExtractor + + else: import sys diff --git a/src/transformers/models/clipseg/feature_extraction_clipseg.py b/src/transformers/models/clipseg/feature_extraction_clipseg.py new file mode 100644 index 0000000000000..78aa288d3d00b --- /dev/null +++ b/src/transformers/models/clipseg/feature_extraction_clipseg.py @@ -0,0 +1,25 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for ViT.""" + +from ...utils import logging +from .image_processing_clipseg import CLIPSegImageProcessor + + +logger = logging.get_logger(__name__) + + +# Feature extractor for CLIPSeg is being replaced by image processor +CLIPSegFeatureExtractor = CLIPSegImageProcessor \ No newline at end of file diff --git a/src/transformers/models/clipseg/image_processing_clipseg.py b/src/transformers/models/clipseg/image_processing_clipseg.py new file mode 100644 index 0000000000000..c52e1c650b523 --- /dev/null +++ b/src/transformers/models/clipseg/image_processing_clipseg.py @@ -0,0 +1,279 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
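(A brief aside, not part of the diff: `CLIPSegFeatureExtractor` above is only an alias of the `CLIPSegImageProcessor` implemented below, which rescales inputs, normalizes them with the ImageNet standard mean/std, and resizes them. A minimal sketch of constructing it, assuming this commit is applied and the vision extras are installed; the default size is 224x224 at this point and is bumped to 352x352 in a later commit.)

from transformers import CLIPSegFeatureExtractor
from transformers.models.clipseg.image_processing_clipseg import CLIPSegImageProcessor

# Override the default 224x224 size with the 352x352 resolution used elsewhere in this series.
image_processor = CLIPSegFeatureExtractor(size={"height": 352, "width": 352})
assert isinstance(image_processor, CLIPSegImageProcessor)  # the feature extractor class is the image processor

print(image_processor.size)        # {'height': 352, 'width': 352}
print(image_processor.image_mean)  # IMAGENET_STANDARD_MEAN
print(image_processor.image_std)   # IMAGENET_STANDARD_STD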
+"""Image processor class for CLIPSeg.""" + +from typing import Any, Dict, List, Optional, Union + +import numpy as np + +from transformers.utils.generic import TensorType + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, +) +from ...utils import logging +from ...utils.import_utils import is_vision_available + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +class CLIPSegImageProcessor(BaseImageProcessor): + r""" + Constructs a CLIPSeg image processor. + + Args: + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize: + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Image standard deviation. + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + **kwargs + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size, default_to_square=False) + + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_resize = do_resize + self.size = size + self.resample = resample + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. 
+ + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + image_mean (`float` or `List[float]`): + Image mean. + image_std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample: + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + return resize( + image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs + ) + + def preprocess( + self, + images: ImageInput, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. 
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. + """ + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + resample = resample if resample is not None else self.resample + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + # All transformations expect numpy arrays. 
+ images = [to_numpy_array(image) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) \ No newline at end of file diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py new file mode 100644 index 0000000000000..a02cbf15b9800 --- /dev/null +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -0,0 +1,107 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for CLIPSeg +""" +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding + + +class CLIPProcessor(ProcessorMixin): + r""" + Constructs a CLIP processor which wraps a CLIPSeg feature extractor and a CLIP tokenizer into a single processor. + + [`CLIPProcessor`] offers all the functionalities of [`CLIPSegFeatureExtractor`] and [`CLIPTokenizerFast`]. See the + [`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information. + + Args: + feature_extractor ([`CLIPSegFeatureExtractor`]): + The feature extractor is a required input. + tokenizer ([`CLIPTokenizerFast`]): + The tokenizer is a required input. + """ + feature_extractor_class = "CLIPSegFeatureExtractor" + tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + self.current_processor = self.feature_extractor + + def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + CLIPSegFeatureExtractor's [`~CLIPSegFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the + doctsring of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. 
Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if text is None and images is None: + raise ValueError("You have to specify either text or images. Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. 
+ """ + return self.tokenizer.decode(*args, **kwargs) \ No newline at end of file diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index a3112c4454b4b..4084159d49b4c 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -43,6 +43,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class CLIPSegFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class ConditionalDetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 46b21320ade10..30e6128485384 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -737,4 +737,4 @@ def test_inference_image_segmentation(self): expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) - self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) \ No newline at end of file + self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) From 09b7e031a8fcc4e4decbe2546de6e64dd4c36b05 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Nov 2022 19:17:15 +0100 Subject: [PATCH 29/47] Fix processor --- src/transformers/__init__.py | 4 ++- src/transformers/models/clipseg/__init__.py | 5 ++++ .../convert_clipseg_original_pytorch_to_hf.py | 29 ++++++++----------- .../clipseg/image_processing_clipseg.py | 4 +-- .../models/clipseg/processing_clipseg.py | 8 ++--- src/transformers/processing_utils.py | 2 +- 6 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2e23b32bdabca..98c5c2ee387c8 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -176,6 +176,7 @@ "CLIPSegConfig", "CLIPSegTextConfig", "CLIPSegVisionConfig", + "CLIPSegProcessor", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDetrConfig"], @@ -3237,16 +3238,17 @@ from .models.clip import ( CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPConfig, - CLIPProcessor, CLIPTextConfig, CLIPTokenizer, CLIPVisionConfig, + CLIPProcessor, ) from .models.clipseg import ( CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig, + CLIPSegProcessor, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDetrConfig diff --git a/src/transformers/models/clipseg/__init__.py b/src/transformers/models/clipseg/__init__.py index 97c641b7c0074..1943e36a19f66 100644 --- a/src/transformers/models/clipseg/__init__.py +++ b/src/transformers/models/clipseg/__init__.py @@ -27,6 +27,9 @@ "CLIPSegTextConfig", "CLIPSegVisionConfig", ], + "processing_clipseg": [ + "CLIPSegProcessor", + ], } try: @@ -60,6 +63,8 @@ CLIPSegVisionConfig, ) + from .processing_clipseg import CLIPSegProcessor + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 
e40e1efea7b1b..226b902fa2f44 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -21,7 +21,8 @@ from PIL import Image from torchvision.transforms import Compose, Resize, ToTensor -from transformers import CLIPSegConfig, CLIPSegForImageSegmentation, CLIPSegTextConfig, CLIPSegVisionConfig +from transformers import CLIPSegConfig, CLIPSegFeatureExtractor, CLIPTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation, CLIPSegTextConfig, CLIPSegVisionConfig +from transformers.models.clip.processing_clip import CLIPProcessor def get_clipseg_config(model_name): @@ -147,14 +148,6 @@ def convert_state_dict(orig_state_dict, config): return orig_state_dict -image_transforms = Compose( - [ - ToTensor(), - Resize((352, 352)), - ] -) - - def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub): config = get_clipseg_config(model_name) model = CLIPSegForImageSegmentation(config) @@ -176,18 +169,19 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]: raise ValueError(f"Unexpected keys: {unexpected_keys}") - # TODO create feature extractor - # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) + feature_extractor = CLIPSegFeatureExtractor() image = Image.open("/Users/nielsrogge/Documents/cats.jpg").convert("RGB") - pixel_values = image_transforms(image).unsqueeze(0).repeat(4, 1, 1, 1) + pixel_values = feature_extractor(image, return_tensors="pt").pixel_values - # prompts = ["a glass", "something to fill", "wood", "a jar"] - # tokenizer = CLIPTokenizer.from_pretrained("openai/") - # input_ids = CLIPTokenizer(prompts, padding="max_length", return_tensors="pt") + prompts = ["a glass", "something to fill", "wood", "a jar"] + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + input_ids = tokenizer(prompts, padding="max_length", return_tensors="pt").input_ids input_ids = torch.tensor([[1, 2] + [9] * 75]).repeat(4, 1) + processor = CLIPProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) + with torch.no_grad(): - outputs = model(input_ids, pixel_values) + outputs = model(input_ids, pixel_values.repeat(len(prompts), 1, 1, 1)) # verify values expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) @@ -214,8 +208,9 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ # feature_extractor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - print(f"Pushing model {model_name} to the hub") + print(f"Pushing model and processor for {model_name} to the hub") model.push_to_hub(f"nielsr/{model_name}") + processor.push_to_hub(f"nielsr/{model_name}") if __name__ == "__main__": diff --git a/src/transformers/models/clipseg/image_processing_clipseg.py b/src/transformers/models/clipseg/image_processing_clipseg.py index c52e1c650b523..fc0f6bf9dbbdb 100644 --- a/src/transformers/models/clipseg/image_processing_clipseg.py +++ b/src/transformers/models/clipseg/image_processing_clipseg.py @@ -69,7 +69,7 @@ class CLIPSegImageProcessor(BaseImageProcessor): do_resize (`bool`, *optional*, defaults to `True`): Whether to resize the image's (height, width) dimensions to the specified `(size["height"], size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. 
- size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`): + size (`dict`, *optional*, defaults to `{"height": 352, "width": 352}`): Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` method. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): @@ -92,7 +92,7 @@ def __init__( **kwargs ) -> None: super().__init__(**kwargs) - size = size if size is not None else {"height": 224, "width": 224} + size = size if size is not None else {"height": 352, "width": 352} size = get_size_dict(size, default_to_square=False) self.do_rescale = do_rescale diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index a02cbf15b9800..91f91205c403d 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -19,12 +19,12 @@ from ...tokenization_utils_base import BatchEncoding -class CLIPProcessor(ProcessorMixin): +class CLIPSegProcessor(ProcessorMixin): r""" - Constructs a CLIP processor which wraps a CLIPSeg feature extractor and a CLIP tokenizer into a single processor. + Constructs a CLIPSeg processor which wraps a CLIPSeg feature extractor and a CLIP tokenizer into a single processor. - [`CLIPProcessor`] offers all the functionalities of [`CLIPSegFeatureExtractor`] and [`CLIPTokenizerFast`]. See the - [`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information. + [`CLIPSegProcessor`] offers all the functionalities of [`CLIPSegFeatureExtractor`] and [`CLIPTokenizerFast`]. See the + [`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information. Args: feature_extractor ([`CLIPSegFeatureExtractor`]): diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 569e9975b16c9..5cc41c447b7bd 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -56,7 +56,7 @@ def __init__(self, *args, **kwargs): # Sanitize args and kwargs for key in kwargs: if key not in self.attributes: - raise TypeError(f"Unexepcted keyword argument {key}.") + raise TypeError(f"Unexpected keyword argument {key}.") for arg, attribute_name in zip(args, self.attributes): if attribute_name in kwargs: raise TypeError(f"Got multiple values for argument {attribute_name}.") From 751f6e4718a578f51e260daa8dae08ebc7075efa Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 4 Nov 2022 10:32:50 +0100 Subject: [PATCH 30/47] Fix CLIPSegProcessor --- README_ja.md | 2 + docs/source/en/model_doc/clipseg.mdx | 9 +- src/transformers/__init__.py | 8 +- .../models/auto/feature_extraction_auto.py | 1 - src/transformers/models/clipseg/__init__.py | 24 +- .../convert_clipseg_original_pytorch_to_hf.py | 33 ++- .../clipseg/feature_extraction_clipseg.py | 25 -- .../clipseg/image_processing_clipseg.py | 279 ------------------ .../models/clipseg/processing_clipseg.py | 13 +- .../utils/dummy_vision_objects.py | 7 - tests/models/clipseg/test_modeling_clipseg.py | 6 +- .../models/clipseg/test_processor_clipseg.py | 189 ++++++++++++ 12 files changed, 234 insertions(+), 362 deletions(-) delete mode 100644 src/transformers/models/clipseg/feature_extraction_clipseg.py delete mode 100644 src/transformers/models/clipseg/image_processing_clipseg.py create mode 100644 tests/models/clipseg/test_processor_clipseg.py diff --git a/README_ja.md b/README_ja.md index eed7d204f8368..3866a7e29952b 100644 --- a/README_ja.md +++ b/README_ja.md 
@@ -314,6 +314,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. +1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. diff --git a/docs/source/en/model_doc/clipseg.mdx b/docs/source/en/model_doc/clipseg.mdx index 5f58e15268007..41083ddff2d2d 100644 --- a/docs/source/en/model_doc/clipseg.mdx +++ b/docs/source/en/model_doc/clipseg.mdx @@ -44,6 +44,11 @@ Tips: (provided to the model as `input_ids`) or an image (provided to the model as `conditional_pixel_values`). One can also provide custom conditional embeddings (provided to the model as `conditional_embeddings`). + + + CLIPSeg overview. Taken from the original paper. + This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/timojl/clipseg). @@ -61,9 +66,9 @@ The original code can be found [here](https://github.com/timojl/clipseg). 
[[autodoc]] CLIPSegVisionConfig -## CLIPSegFeatureExtractor +## CLIPSegProcessor -[[autodoc]] CLIPSegFeatureExtractor +[[autodoc]] CLIPSegProcessor ## CLIPSegModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 98c5c2ee387c8..b377395a9046c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -174,9 +174,9 @@ "models.clipseg": [ "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPSegConfig", + "CLIPSegProcessor", "CLIPSegTextConfig", "CLIPSegVisionConfig", - "CLIPSegProcessor", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDetrConfig"], @@ -693,7 +693,6 @@ _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") _import_structure["models.clip"].append("CLIPFeatureExtractor") - _import_structure["models.clipseg"].append("CLIPSegFeatureExtractor") _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deformable_detr"].append("DeformableDetrFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") @@ -3238,17 +3237,17 @@ from .models.clip import ( CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPConfig, + CLIPProcessor, CLIPTextConfig, CLIPTokenizer, CLIPVisionConfig, - CLIPProcessor, ) from .models.clipseg import ( CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPSegConfig, + CLIPSegProcessor, CLIPSegTextConfig, CLIPSegVisionConfig, - CLIPSegProcessor, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDetrConfig @@ -3703,7 +3702,6 @@ from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor - from .models.clipseg import CLIPSegFeatureExtractor from .models.conditional_detr import ConditionalDetrFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deformable_detr import DeformableDetrFeatureExtractor diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index ac98d57e4ff8f..76d38f95ab151 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,7 +39,6 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), - ("clipseg", "CLIPSegFeatureExtractor"), ("conditional_detr", "ConditionalDetrFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), diff --git a/src/transformers/models/clipseg/__init__.py b/src/transformers/models/clipseg/__init__.py index 1943e36a19f66..f6b09b9af9757 100644 --- a/src/transformers/models/clipseg/__init__.py +++ b/src/transformers/models/clipseg/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { @@ -27,9 +27,7 @@ "CLIPSegTextConfig", "CLIPSegVisionConfig", ], - "processing_clipseg": [ - "CLIPSegProcessor", - ], + "processing_clipseg": ["CLIPSegProcessor"], } try: @@ -47,14 +45,6 @@ "CLIPSegForImageSegmentation", ] -try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["feature_extraction_clipseg"] = ["CLIPSegFeatureExtractor"] - if TYPE_CHECKING: from .configuration_clipseg import ( CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -62,7 +52,6 @@ CLIPSegTextConfig, CLIPSegVisionConfig, ) - from .processing_clipseg import CLIPSegProcessor try: @@ -80,15 +69,6 @@ CLIPSegVisionModel, ) - try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_clipseg import CLIPSegFeatureExtractor - - else: import sys diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 226b902fa2f44..c26bda6308544 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -16,13 +16,20 @@ """Convert CLIPSeg checkpoints from the original repository. URL: https://github.com/timojl/clipseg.""" import argparse +import requests import torch from PIL import Image -from torchvision.transforms import Compose, Resize, ToTensor -from transformers import CLIPSegConfig, CLIPSegFeatureExtractor, CLIPTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation, CLIPSegTextConfig, CLIPSegVisionConfig -from transformers.models.clip.processing_clip import CLIPProcessor +from transformers import ( + CLIPSegConfig, + CLIPSegForImageSegmentation, + CLIPSegProcessor, + CLIPSegTextConfig, + CLIPSegVisionConfig, + CLIPTokenizer, + ViTFeatureExtractor, +) def get_clipseg_config(model_name): @@ -148,6 +155,12 @@ def convert_state_dict(orig_state_dict, config): return orig_state_dict +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub): config = get_clipseg_config(model_name) model = CLIPSegForImageSegmentation(config) @@ -169,16 +182,14 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]: raise ValueError(f"Unexpected keys: {unexpected_keys}") - feature_extractor = CLIPSegFeatureExtractor() - image = Image.open("/Users/nielsrogge/Documents/cats.jpg").convert("RGB") - pixel_values = feature_extractor(image, return_tensors="pt").pixel_values - - prompts = ["a glass", "something to fill", "wood", "a jar"] + feature_extractor = ViTFeatureExtractor(size=352) tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") - input_ids = tokenizer(prompts, padding="max_length", return_tensors="pt").input_ids - input_ids = torch.tensor([[1, 2] + [9] * 75]).repeat(4, 1) + processor = 
CLIPSegProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) + + image = prepare_img() + text = ["a glass", "something to fill", "wood", "a jar"] - processor = CLIPProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) + inputs = processor(text=text, images=image, padding="max_length", return_tensors="pt") with torch.no_grad(): outputs = model(input_ids, pixel_values.repeat(len(prompts), 1, 1, 1)) diff --git a/src/transformers/models/clipseg/feature_extraction_clipseg.py b/src/transformers/models/clipseg/feature_extraction_clipseg.py deleted file mode 100644 index 78aa288d3d00b..0000000000000 --- a/src/transformers/models/clipseg/feature_extraction_clipseg.py +++ /dev/null @@ -1,25 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Feature extractor class for ViT.""" - -from ...utils import logging -from .image_processing_clipseg import CLIPSegImageProcessor - - -logger = logging.get_logger(__name__) - - -# Feature extractor for CLIPSeg is being replaced by image processor -CLIPSegFeatureExtractor = CLIPSegImageProcessor \ No newline at end of file diff --git a/src/transformers/models/clipseg/image_processing_clipseg.py b/src/transformers/models/clipseg/image_processing_clipseg.py deleted file mode 100644 index fc0f6bf9dbbdb..0000000000000 --- a/src/transformers/models/clipseg/image_processing_clipseg.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Image processor class for CLIPSeg.""" - -from typing import Any, Dict, List, Optional, Union - -import numpy as np - -from transformers.utils.generic import TensorType - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import ( - normalize, - rescale, - resize, - to_channel_dimension_format, -) -from ...image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - is_batched, - to_numpy_array, - valid_images, - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, -) -from ...utils import logging -from ...utils.import_utils import is_vision_available - - -logger = logging.get_logger(__name__) - - -if is_vision_available(): - import PIL - - -class CLIPSegImageProcessor(BaseImageProcessor): - r""" - Constructs a CLIPSeg image processor. 
- - Args: - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in - the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` - method. - do_normalize: - Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): - Mean to use if normalizing the image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): - Image standard deviation. - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `(size["height"], - size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. - size (`dict`, *optional*, defaults to `{"height": 352, "width": 352}`): - Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` - method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): - Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the - `preprocess` method. - """ - - model_input_names = ["pixel_values"] - - def __init__( - self, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BILINEAR, - **kwargs - ) -> None: - super().__init__(**kwargs) - size = size if size is not None else {"height": 352, "width": 352} - size = get_size_dict(size, default_to_square=False) - - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD - self.do_resize = do_resize - self.size = size - self.resample = resample - - def rescale( - self, - image: np.ndarray, - scale: Union[int, float], - data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs - ): - """ - Rescale an image by a scale factor. image = image * scale. - - Args: - image (`np.ndarray`): - Image to rescale. - scale (`int` or `float`): - Scale to apply to the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - return rescale(image, scale=scale, data_format=data_format, **kwargs) - - def normalize( - self, - image: np.ndarray, - mean: Union[float, List[float]], - std: Union[float, List[float]], - data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs - ) -> np.ndarray: - """ - Normalize an image. image = (image - image_mean) / image_std. - - Args: - image (`np.ndarray`): - Image to normalize. - image_mean (`float` or `List[float]`): - Image mean. - image_std (`float` or `List[float]`): - Image standard deviation. 
- data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) - - def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BILINEAR, - data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs - ) -> np.ndarray: - """ - Resize an image to `(size["height"], size["width"])`. - - Args: - image (`np.ndarray`): - Image to resize. - size (`Dict[str, int]`): - Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. - resample: - `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. - data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Returns: - `np.ndarray`: The resized image. - """ - size = get_size_dict(size) - if "height" not in size or "width" not in size: - raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") - return resize( - image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs - ) - - def preprocess( - self, - images: ImageInput, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - ) -> PIL.Image.Image: - """ - Preprocess an image or batch of images. - - Args: - images (`ImageInput`): - Image to preprocess. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to - `True`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with - the longest edge resized to keep the input aspect ratio. - resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only - has an effect if `do_resize` is set to `True`. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. 
Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: defaults to the channel dimension format of the input image. - """ - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - size = get_size_dict(size, default_to_square=False) - resample = resample if resample is not None else self.resample - - if not is_batched(images): - images = [images] - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - if do_rescale and rescale_factor is None: - raise ValueError("Rescale factor must be specified if do_rescale is True.") - - if do_normalize and (image_mean is None or image_std is None): - raise ValueError("Image mean and std must be specified if do_normalize is True.") - - if do_resize and size is None: - raise ValueError("Size must be specified if do_resize is True.") - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if do_rescale: - images = [self.rescale(image=image, scale=rescale_factor) for image in images] - - if do_normalize: - images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] - - if do_resize: - images = [self.resize(image=image, size=size, resample=resample) for image in images] - - images = [to_channel_dimension_format(image, data_format) for image in images] - - data = {"pixel_values": images} - return BatchFeature(data=data, tensor_type=return_tensors) \ No newline at end of file diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 91f91205c403d..4a18e4ba7a902 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -21,18 +21,19 @@ class CLIPSegProcessor(ProcessorMixin): r""" - Constructs a CLIPSeg processor which wraps a CLIPSeg feature extractor and a CLIP tokenizer into a single processor. + Constructs a CLIPSeg processor which wraps a CLIPSeg feature extractor and a CLIP tokenizer into a single + processor. - [`CLIPSegProcessor`] offers all the functionalities of [`CLIPSegFeatureExtractor`] and [`CLIPTokenizerFast`]. See the + [`CLIPSegProcessor`] offers all the functionalities of [`ViTFeatureExtractor`] and [`CLIPTokenizerFast`]. See the [`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information. 
Args: - feature_extractor ([`CLIPSegFeatureExtractor`]): + feature_extractor ([`ViTFeatureExtractor`]): The feature extractor is a required input. tokenizer ([`CLIPTokenizerFast`]): The tokenizer is a required input. """ - feature_extractor_class = "CLIPSegFeatureExtractor" + feature_extractor_class = "ViTFeatureExtractor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") def __init__(self, feature_extractor, tokenizer): @@ -44,7 +45,7 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to - CLIPSegFeatureExtractor's [`~CLIPSegFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the + ViTFeatureExtractor's [`~ViTFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the doctsring of the above two methods for more information. Args: @@ -104,4 +105,4 @@ def decode(self, *args, **kwargs): This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to the docstring of this method for more information. """ - return self.tokenizer.decode(*args, **kwargs) \ No newline at end of file + return self.tokenizer.decode(*args, **kwargs) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 4084159d49b4c..a3112c4454b4b 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -43,13 +43,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class CLIPSegFeatureExtractor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class ConditionalDetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 30e6128485384..79062fd1350be 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -24,7 +24,7 @@ import requests import transformers -from transformers import MODEL_MAPPING, CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig +from transformers import MODEL_MAPPING, CLIPSegConfig, CLIPSegProcessor, CLIPSegTextConfig, CLIPSegVisionConfig from transformers.models.auto import get_values from transformers.testing_utils import ( is_flax_available, @@ -57,8 +57,6 @@ if is_vision_available(): from PIL import Image - from transformers import CLIPProcessor - if is_flax_available(): import jax.numpy as jnp @@ -717,7 +715,7 @@ class CLIPSegModelIntegrationTest(unittest.TestCase): def test_inference_image_segmentation(self): # TODO update to appropriate organization model_name = "nielsr/clipseg-rd64-refined" - processor = CLIPProcessor.from_pretrained(model_name) + processor = CLIPSegProcessor.from_pretrained(model_name) model = CLIPSegModel.from_pretrained(model_name).to(torch_device) image = prepare_img() diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py new file mode 100644 index 0000000000000..b861662da6e57 --- /dev/null +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -0,0 +1,189 @@ +# Copyright 2022 The HuggingFace Team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +import tempfile +import unittest + +import numpy as np +import pytest + +from transformers import CLIPTokenizer, CLIPTokenizerFast +from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES +from transformers.testing_utils import require_vision +from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTFeatureExtractor, CLIPSegProcessor + + +@require_vision +class CLIPSegProcessorTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + # fmt: off + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] + # fmt: on + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + self.special_tokens_map = {"unk_token": ""} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + feature_extractor_map = { + "do_resize": True, + "size": 20, + "do_center_crop": True, + "crop_size": 18, + "do_normalize": True, + "image_mean": [0.48145466, 0.4578275, 0.40821073], + "image_std": [0.26862954, 0.26130258, 0.27577711], + } + self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) + with open(self.feature_extractor_file, "w", encoding="utf-8") as fp: + json.dump(feature_extractor_map, fp) + + def get_tokenizer(self, **kwargs): + return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs): + return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + + def get_feature_extractor(self, **kwargs): + return ViTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def prepare_image_inputs(self): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. 
+ """ + + image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + return image_inputs + + def test_save_load_pretrained_default(self): + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() + feature_extractor = self.get_feature_extractor() + + processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor) + processor_slow.save_pretrained(self.tmpdirname) + processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False) + + processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor) + processor_fast.save_pretrained(self.tmpdirname) + processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer) + self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) + + self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertIsInstance(processor_slow.feature_extractor, ViTFeatureExtractor) + self.assertIsInstance(processor_fast.feature_extractor, ViTFeatureExtractor) + + def test_save_load_pretrained_additional_features(self): + processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) + + processor = CLIPSegProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, ViTFeatureExtractor) + + def test_feature_extractor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + image_input = self.prepare_image_inputs() + + input_feat_extract = feature_extractor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = 
self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) \ No newline at end of file From 061ce9f036e2eaf6aa87c40ca43f5b73b55f3280 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 4 Nov 2022 11:03:15 +0100 Subject: [PATCH 31/47] Fix conversion script --- .../convert_clipseg_original_pytorch_to_hf.py | 24 ++++++++++--------- src/transformers/models/clipseg/test.py | 7 ------ .../models/clipseg/test_processor_clipseg.py | 4 ++-- 3 files changed, 15 insertions(+), 20 deletions(-) delete mode 100644 src/transformers/models/clipseg/test.py diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index c26bda6308544..51966e033c768 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -16,11 +16,11 @@ """Convert CLIPSeg checkpoints from the original repository. 
URL: https://github.com/timojl/clipseg.""" import argparse -import requests import torch from PIL import Image +import requests from transformers import ( CLIPSegConfig, CLIPSegForImageSegmentation, @@ -161,6 +161,7 @@ def prepare_img(): image = Image.open(requests.get(url, stream=True).raw) return image + def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub): config = get_clipseg_config(model_name) model = CLIPSegForImageSegmentation(config) @@ -189,21 +190,24 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ image = prepare_img() text = ["a glass", "something to fill", "wood", "a jar"] - inputs = processor(text=text, images=image, padding="max_length", return_tensors="pt") + inputs = processor(text=text, images=[image] * len(text), padding="max_length", return_tensors="pt") + + for k, v in inputs.items(): + print(k, v.shape) with torch.no_grad(): - outputs = model(input_ids, pixel_values.repeat(len(prompts), 1, 1, 1)) + outputs = model(**inputs) # verify values - expected_cond = torch.tensor([0.0548, 0.0067, -0.1543]) - expected_pooled_output = torch.tensor([0.2208, -0.7577, -0.1391]) + expected_cond = torch.tensor([0.1110, -0.1882, 0.1645]) + expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328]) if "refined" in model_name: expected_masks_slice = torch.tensor( - [[-6.8533, -6.8308, -6.6634], [-6.7272, -6.4926, -6.4597], [-6.4338, -6.2161, -6.2296]] + [[-10.0407, -9.9431, -10.2646], [-9.9751, -9.7064, -9.9586], [-9.6891, -9.5645, -9.9618]] ) else: expected_masks_slice = torch.tensor( - [[-4.1992, -4.1912, -4.1523], [-4.1509, -4.1442, -4.1091], [-4.0581, -4.0355, -4.0107]] + [[-7.2877, -7.2711, -7.2463], [-7.2652, -7.2780, -7.2520], [-7.2239, -7.2204, -7.2001]] ) assert torch.allclose(outputs.predicted_masks[0, :3, :3], expected_masks_slice, atol=1e-3) @@ -212,11 +216,9 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ print("Looks ok!") if pytorch_dump_folder_path is not None: - print(f"Saving model to {pytorch_dump_folder_path}") + print(f"Saving model and processor to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - - # print(f"Saving feature extractor to {pytorch_dump_folder_path}") - # feature_extractor.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: print(f"Pushing model and processor for {model_name} to the hub") diff --git a/src/transformers/models/clipseg/test.py b/src/transformers/models/clipseg/test.py deleted file mode 100644 index 810eb39444435..0000000000000 --- a/src/transformers/models/clipseg/test.py +++ /dev/null @@ -1,7 +0,0 @@ -from transformers import CLIPSegConfig, CLIPSegForImageSegmentation - - -model = CLIPSegForImageSegmentation(CLIPSegConfig()) - -for name, param in model.named_parameters(): - print(name, param.shape) diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py index b861662da6e57..af3a8e0193275 100644 --- a/tests/models/clipseg/test_processor_clipseg.py +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -30,7 +30,7 @@ if is_vision_available(): from PIL import Image - from transformers import ViTFeatureExtractor, CLIPSegProcessor + from transformers import CLIPSegProcessor, ViTFeatureExtractor @require_vision @@ -186,4 +186,4 @@ def test_tokenizer_decode(self): decoded_processor = processor.batch_decode(predicted_ids) decoded_tok = tokenizer.batch_decode(predicted_ids) - 
self.assertListEqual(decoded_tok, decoded_processor) \ No newline at end of file + self.assertListEqual(decoded_tok, decoded_processor) From f234652e9c9f47b8d904aeb21720eb307348b68d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 4 Nov 2022 12:13:20 +0100 Subject: [PATCH 32/47] Fix most tests --- .../convert_clipseg_original_pytorch_to_hf.py | 7 ++----- .../models/clipseg/modeling_clipseg.py | 6 ++++-- tests/models/clipseg/test_modeling_clipseg.py | 21 ++++++++++++------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 51966e033c768..411f23b53a48e 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -192,14 +192,11 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ inputs = processor(text=text, images=[image] * len(text), padding="max_length", return_tensors="pt") - for k, v in inputs.items(): - print(k, v.shape) - with torch.no_grad(): outputs = model(**inputs) # verify values - expected_cond = torch.tensor([0.1110, -0.1882, 0.1645]) + expected_conditional = torch.tensor([0.1110, -0.1882, 0.1645]) expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328]) if "refined" in model_name: expected_masks_slice = torch.tensor( @@ -211,7 +208,7 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ ) assert torch.allclose(outputs.predicted_masks[0, :3, :3], expected_masks_slice, atol=1e-3) - assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_cond, atol=1e-3) + assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3) assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3) print("Looks ok!") diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 823f1407548f9..5e8f1745469b0 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -38,10 +38,12 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "organization/clipseg-rd64-uni" + +_CHECKPOINT_FOR_DOC = "nielsr/clipseg-rd64-refined" CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "organization/clipseg-rd64-uni", + # TODO update organziation + "nielsr/clipseg-rd64-refined", # See all CLIPSeg models at https://huggingface.co/models?filter=clipseg ] diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 79062fd1350be..3ed1e1dae16b5 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -716,12 +716,11 @@ def test_inference_image_segmentation(self): # TODO update to appropriate organization model_name = "nielsr/clipseg-rd64-refined" processor = CLIPSegProcessor.from_pretrained(model_name) - model = CLIPSegModel.from_pretrained(model_name).to(torch_device) + model = CLIPSegForImageSegmentation.from_pretrained(model_name).to(torch_device) image = prepare_img() - inputs = processor( - text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" - ).to(torch_device) + texts = ["a cat", "a remote", "a blanket"] + inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt").to(torch_device) # 
forward pass with torch.no_grad(): @@ -730,9 +729,15 @@ def test_inference_image_segmentation(self): # verify the predicted masks self.assertEqual( outputs.predicted_masks.shape, - torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + torch.Size((3, 352, 352)), ) + expected_masks_slice = torch.tensor( + [[-7.4577, -7.4952, -7.4072], [-7.3115, -7.0969, -7.1624], [-6.9472, -6.7641, -6.8911]] + ) + self.assertTrue(torch.allclose(outputs.predicted_masks[0, :3, :3], expected_masks_slice, atol=1e-3)) - expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) - - self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) + # verify conditional and pooled output + expected_conditional = torch.tensor([0.5601, -0.0314, 0.1980]) + expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328]) + self.assertTrue(torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3)) + self.assertTrue(torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)) \ No newline at end of file From a2a5213da9af745617c02dcbcfa6807294f849e1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 4 Nov 2022 14:33:37 +0100 Subject: [PATCH 33/47] Fix equivalence test --- .../models/clipseg/modeling_clipseg.py | 93 +++++++++++++++++-- src/transformers/models/clipseg/test.py | 54 +++++++++++ tests/models/clipseg/test_modeling_clipseg.py | 6 +- 3 files changed, 141 insertions(+), 12 deletions(-) create mode 100644 src/transformers/models/clipseg/test.py diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 5e8f1745469b0..975c13f4fddaa 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -115,6 +115,26 @@ def to_tuple(self) -> Tuple[Any]: ) +@dataclass +class CLIPSegDecoderOutput(ModelOutput): + """ + Args: + predicted_masks (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + ... + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. 
+ """ + + predicted_masks: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + @dataclass class CLIPSegImageSegmentationOutput(ModelOutput): """ @@ -131,10 +151,11 @@ class CLIPSegImageSegmentationOutput(ModelOutput): conditional_embeddings: torch.FloatTensor = None pooled_output: torch.FloatTensor = None vision_model_output: BaseModelOutputWithPooling = None + decoder_output: CLIPSegDecoderOutput = None def to_tuple(self) -> Tuple[Any]: return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + self[k] if k not in ["vision_model_output", "decoder_output"] else getattr(self, k).to_tuple() for k in self.keys() ) @@ -1243,7 +1264,17 @@ def __init__(self, config: CLIPSegConfig): decoder_config.hidden_act = "relu" self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))]) - def forward(self, hidden_states, conditional_embeddings): + def forward( + self, + hidden_states: Tuple[torch.Tensor], + conditional_embeddings: torch.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + activations = hidden_states[::-1] a = None @@ -1257,7 +1288,17 @@ def forward(self, hidden_states, conditional_embeddings): a = self.film_mul(conditional_embeddings) * a.permute(1, 0, 2) + self.film_add(conditional_embeddings) a = a.permute(1, 0, 2) - a = layer(a, attention_mask=None, causal_attention_mask=None)[0] + layer_outputs = layer( + a, attention_mask=None, causal_attention_mask=None, output_attentions=output_attentions + ) + + a = layer_outputs[0] + + if output_hidden_states: + all_hidden_states += (a,) + + if output_attentions: + all_attentions += (layer_outputs[1],) a = a[:, 1:, :].permute(0, 2, 1) # remove cls token and reshape to [batch_size, reduce_dim, seq_len] @@ -1266,9 +1307,24 @@ def forward(self, hidden_states, conditional_embeddings): batch_size = conditional_embeddings.shape[0] a = a.view(batch_size, a.shape[1], size, size) - a = self.transposed_convolution(a) + a = self.transposed_convolution(a).squeeze() - return a + if not return_dict: + return tuple( + v + for v in [ + a, + all_hidden_states, + all_attentions, + ] + if v is not None + ) + + return CLIPSegDecoderOutput( + predicted_masks=a, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel): @@ -1347,6 +1403,19 @@ def forward( # we add +1 here as the hidden states also include the initial embeddings activations = [hidden_states[i + 1] for i in self.extract_layers] + # update vision_outputs + if return_dict: + vision_outputs = BaseModelOutputWithPooling( + last_hidden_state=vision_outputs.last_hidden_state, + pooler_output=vision_outputs.pooler_output, + hidden_states=vision_outputs.hidden_states if output_hidden_states else None, + attentions=vision_outputs.attentions, + ) + else: + vision_outputs = ( + vision_outputs[:2] + vision_outputs[3:] if not output_hidden_states else vision_outputs + ) + # step 2: compute conditional embeddings, either from text, images or an own provided embedding if conditional_embeddings is None: conditional_embeddings = self.get_conditional_embeddings( @@ -1367,10 +1436,15 @@ def forward( " `config.projection_dim`." 
) - predicted_masks = self.decoder(activations, conditional_embeddings).squeeze() + decoder_outputs = self.decoder( + activations, + conditional_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) - if output_hidden_states: - raise NotImplementedError("To do") + predicted_masks = decoder_outputs.predicted_masks if return_dict else decoder_outputs[0] loss = None if labels is not None: @@ -1378,7 +1452,7 @@ def forward( loss = loss_fn(predicted_masks, labels) if not return_dict: - output = (predicted_masks, conditional_embeddings, pooled_output) + vision_outputs + output = (predicted_masks, conditional_embeddings, pooled_output, vision_outputs, decoder_outputs) return ((loss,) + output) if loss is not None else output return CLIPSegImageSegmentationOutput( @@ -1387,4 +1461,5 @@ def forward( conditional_embeddings=conditional_embeddings, pooled_output=pooled_output, vision_model_output=vision_outputs, + decoder_output=decoder_outputs, ) diff --git a/src/transformers/models/clipseg/test.py b/src/transformers/models/clipseg/test.py new file mode 100644 index 0000000000000..2f5d5c2ff5bff --- /dev/null +++ b/src/transformers/models/clipseg/test.py @@ -0,0 +1,54 @@ +from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation +from PIL import Image +import requests +import torch + +model_name = "nielsr/clipseg-rd64-refined" +processor = CLIPSegProcessor.from_pretrained(model_name) +model = CLIPSegForImageSegmentation.from_pretrained(model_name) + +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + +image = prepare_img() +texts = ["a cat", "a remote", "a blanket"] +inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt") + +# forward pass: return dict +with torch.no_grad(): + dict_outputs = model(**inputs, output_attentions=True) + +# forward pass: return tuple +with torch.no_grad(): + tuple_outputs = model(**inputs, output_attentions=True, return_dict=False) + +for idx, key in enumerate(dict_outputs.keys()): + if idx < 3: + assert torch.allclose(dict_outputs[key], tuple_outputs[idx]) + elif key == "vision_model_output": + for i, vision_key in enumerate(dict_outputs[key].keys()): + # last hidden state, pooler output + if isinstance(dict_outputs["vision_model_output"][vision_key], torch.Tensor): + assert torch.allclose(dict_outputs["vision_model_output"][vision_key], tuple_outputs[idx][i]) + # attentions + else: + print("Key:", vision_key) + for j, value in enumerate(dict_outputs["vision_model_output"][vision_key]): + assert torch.allclose(value, tuple_outputs[idx][i][j]) + elif key == "decoder_output": + for j, decoder_key in enumerate(dict_outputs["decoder_output"].keys()): + if isinstance(dict_outputs["decoder_output"][decoder_key], torch.Tensor): + assert torch.allclose(dict_outputs["decoder_output"][decoder_key], tuple_outputs[idx][j]) + +# print(len(dict_outputs), len(tuple_outputs)) + +# print(len(dict_outputs[-1]), len(tuple_outputs[-1])) + +# print(type(dict_outputs[-1]), type(tuple_outputs[-1])) + +# assert torch.allclose(dict_outputs[-1][0], tuple_outputs[-1][0]) + +# for x, y in zip(dict_outputs[-1], tuple_outputs[-1]): +# assert torch.allclose(x, y) \ No newline at end of file diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 3ed1e1dae16b5..5f310e119c4ea 100644 --- 
a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -704,8 +704,8 @@ def test_model_from_pretrained(self): # We will verify our results on an image of cute cats def prepare_img(): url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im + image = Image.open(requests.get(url, stream=True).raw) + return image @require_vision @@ -740,4 +740,4 @@ def test_inference_image_segmentation(self): expected_conditional = torch.tensor([0.5601, -0.0314, 0.1980]) expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328]) self.assertTrue(torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3)) - self.assertTrue(torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)) \ No newline at end of file + self.assertTrue(torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)) From dca1473503e4428819bd7d327e1e6fdae60d9f70 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 4 Nov 2022 14:43:32 +0100 Subject: [PATCH 34/47] Fix README --- README.md | 3 +-- README_es.md | 3 +-- README_ja.md | 3 +-- README_ko.md | 3 +-- README_zh-hans.md | 3 +-- README_zh-hant.md | 3 +-- docs/source/en/index.mdx | 5 ++--- docs/source/en/serialization.mdx | 2 +- src/transformers/models/auto/configuration_auto.py | 4 ++-- .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/clipseg/test.py | 10 +++++++--- 11 files changed, 19 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 42bb91a5a6246..f2d65752d67a2 100644 --- a/README.md +++ b/README.md @@ -279,8 +279,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. -1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. diff --git a/README_es.md b/README_es.md index fd8b60c4dd3b7..32156a08e2674 100644 --- a/README_es.md +++ b/README_es.md @@ -279,8 +279,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. -1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. 
**[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. diff --git a/README_ja.md b/README_ja.md index 3866a7e29952b..edb49ce9d5c91 100644 --- a/README_ja.md +++ b/README_ja.md @@ -314,8 +314,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. -1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. diff --git a/README_ko.md b/README_ko.md index abcd1cf6905f9..33bcdda6b6193 100644 --- a/README_ko.md +++ b/README_ko.md @@ -229,8 +229,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. 
**[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. -1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. diff --git a/README_zh-hans.md b/README_zh-hans.md index 8a828b6280e10..dbf8c8b8e21ef 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -253,8 +253,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 -1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (来自 ) 伴随论文 [Image Segmentation Using Text and Image Prompts]() 由 Timo Lüddecke and Alexander Ecker 发布。 -1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . +1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 4ead564bc5b9d..92ca90cecdd8e 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -265,8 +265,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. -1. **[CLIPSegSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from ) released with the paper []() by . +1. 
**[CLIPSeg](https://huggingface.co/docs/transformers/main/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 87fc718ba5a36..5f446f21b5365 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -67,8 +67,7 @@ The documentation is organized into five sections: 1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[CLIPSeg](model_doc/clipseg)** (from ) released with the paper [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke and Alexander Ecker. -1. **[CLIPSegSeg](model_doc/clipseg)** (from ) released with the paper []() by . +1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. 
**[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. @@ -225,7 +224,7 @@ Flax), PyTorch, and/or TensorFlow. | CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | -| CLIPSegSeg | ❌ | ❌ | ✅ | ❌ | ❌ | +| CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | | Conditional DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 14557218b820c..e32bb6efd0852 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -56,7 +56,7 @@ Ready-made configurations include the following architectures: - BLOOM - CamemBERT - CLIP -- CLIPSegSeg +- CLIPSeg - CodeGen - Conditional DETR - ConvBERT diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 0f7a83970bc8d..d8b59f123f676 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -183,7 +183,7 @@ ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("clipseg", "CLIPSegSEG_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("clipseg", "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("codegen", "CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("conditional_detr", "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convbert", "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -317,7 +317,7 @@ ("camembert", "CamemBERT"), ("canine", "CANINE"), ("clip", "CLIP"), - ("clipseg", "CLIPSegSeg"), + ("clipseg", "CLIPSeg"), ("codegen", "CodeGen"), ("conditional_detr", "Conditional DETR"), ("convbert", "ConvBERT"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 76d38f95ab151..bc30cc21b60d0 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,7 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("clipseg", "ViTFeatureExtractor"), ("conditional_detr", "ConditionalDetrFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), diff --git a/src/transformers/models/clipseg/test.py b/src/transformers/models/clipseg/test.py index 2f5d5c2ff5bff..ba42058bd6704 100644 --- a/src/transformers/models/clipseg/test.py +++ b/src/transformers/models/clipseg/test.py @@ -1,17 +1,21 @@ -from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation +import torch from PIL import Image + import requests -import torch +from transformers import CLIPSegForImageSegmentation, CLIPSegProcessor + model_name = "nielsr/clipseg-rd64-refined" processor = CLIPSegProcessor.from_pretrained(model_name) model = CLIPSegForImageSegmentation.from_pretrained(model_name) + def prepare_img(): url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) return image + image = prepare_img() texts = ["a cat", "a remote", "a blanket"] inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt") @@ -51,4 +55,4 @@ def prepare_img(): # assert torch.allclose(dict_outputs[-1][0], tuple_outputs[-1][0]) # for x, y in 
zip(dict_outputs[-1], tuple_outputs[-1]): -# assert torch.allclose(x, y) \ No newline at end of file +# assert torch.allclose(x, y) From cbd26c4735dc5173c3337474783c4011f8af1f8b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 4 Nov 2022 15:01:38 +0100 Subject: [PATCH 35/47] Add model to doc tests --- .../models/clipseg/modeling_clipseg.py | 64 +++++++++++++++---- utils/documentation_tests.txt | 3 +- 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 975c13f4fddaa..5c9c4c74c2153 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -807,8 +807,8 @@ def forward( ```python >>> from transformers import CLIPTokenizer, CLIPSegTextModel - >>> model = CLIPSegTextModel.from_pretrained("organization/clipseg-rd64-uni") - >>> tokenizer = CLIPTokenizer.from_pretrained("organization/clipseg-rd64-uni") + >>> tokenizer = CLIPTokenizer.from_pretrained("nielsr/clipseg-rd64-refined") + >>> model = CLIPSegTextModel.from_pretrained("nielsr/clipseg-rd64-refined") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -916,10 +916,10 @@ def forward( ```python >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, CLIPSegVisionModel + >>> from transformers import CLIPSegProcessor, CLIPSegVisionModel - >>> model = CLIPSegVisionModel.from_pretrained("organization/clipseg-rd64-uni") - >>> processor = CLIPProcessor.from_pretrained("organization/clipseg-rd64-uni") + >>> processor = CLIPSegProcessor.from_pretrained("nielsr/clipseg-rd64-refined") + >>> model = CLIPSegVisionModel.from_pretrained("nielsr/clipseg-rd64-refined") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -994,8 +994,8 @@ def get_text_features( ```python >>> from transformers import CLIPTokenizer, CLIPSegModel - >>> model = CLIPSegModel.from_pretrained("organization/clipseg-rd64-uni") - >>> tokenizer = CLIPTokenizer.from_pretrained("organization/clipseg-rd64-uni") + >>> tokenizer = CLIPTokenizer.from_pretrained("nielsr/clipseg-rd64-refined") + >>> model = CLIPSegModel.from_pretrained("nielsr/clipseg-rd64-refined") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) @@ -1039,10 +1039,10 @@ def get_image_features( ```python >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, CLIPSegModel + >>> from transformers import CLIPSegProcessor, CLIPSegModel - >>> model = CLIPSegModel.from_pretrained("organization/clipseg-rd64-uni") - >>> processor = CLIPProcessor.from_pretrained("organization/clipseg-rd64-uni") + >>> processor = CLIPSegProcessor.from_pretrained("nielsr/clipseg-rd64-refined") + >>> model = CLIPSegModel.from_pretrained("nielsr/clipseg-rd64-refined") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1091,10 +1091,10 @@ def forward( ```python >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, CLIPSegModel + >>> from transformers import CLIPSegProcessor, CLIPSegModel - >>> model = CLIPSegModel.from_pretrained("organization/clipseg-rd64-uni") - >>> processor = CLIPProcessor.from_pretrained("organization/clipseg-rd64-uni") + >>> 
processor = CLIPSegProcessor.from_pretrained("nielsr/clipseg-rd64-refined") + >>> model = CLIPSegModel.from_pretrained("nielsr/clipseg-rd64-refined") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1327,6 +1327,12 @@ def forward( ) +@add_start_docstrings( + """ + CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation. + """, + CLIPSEG_START_DOCSTRING, +) class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel): config_class = CLIPSegConfig @@ -1373,6 +1379,8 @@ def get_conditional_embeddings( return conditional_embeddings + @add_start_docstrings_to_model_forward(CLIPSEG_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLIPSegImageSegmentationOutput, config_class=CLIPSegTextConfig) def forward( self, input_ids: Optional[torch.FloatTensor] = None, @@ -1386,7 +1394,35 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPSegOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation + >>> from PIL import Image + >>> import requests + + >>> processor = CLIPSegProcessor.from_pretrained("nielsr/clipseg-rd64-refined") + >>> model = CLIPSegForImageSegmentation.from_pretrained("nielsr/clipseg-rd64-refined") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> texts = ["a cat", "a remote", "a blanket"] + >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> predicted_masks = outputs.predicted_masks + >>> print(predicted_masks.shape) + torch.Size([3, 352, 352]) + ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the query images through the frozen CLIP vision encoder @@ -1436,6 +1472,7 @@ def forward( " `config.projection_dim`." 
) + # step 3: forward both the pooled output and the activations through the lightweight decoder to predict masks decoder_outputs = self.decoder( activations, conditional_embeddings, @@ -1443,7 +1480,6 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - predicted_masks = decoder_outputs.predicted_masks if return_dict else decoder_outputs[0] loss = None diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 53a74703d9659..18c7894da8312 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -35,6 +35,7 @@ src/transformers/models/bloom/configuration_bloom.py src/transformers/models/camembert/configuration_camembert.py src/transformers/models/canine/configuration_canine.py src/transformers/models/clip/configuration_clip.py +src/transformers/models/clipseg/modeling_clipseg.py src/transformers/models/codegen/configuration_codegen.py src/transformers/models/conditional_detr/configuration_conditional_detr.py src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -187,4 +188,4 @@ src/transformers/models/xlnet/configuration_xlnet.py src/transformers/models/yolos/configuration_yolos.py src/transformers/models/yolos/modeling_yolos.py src/transformers/models/x_clip/modeling_x_clip.py -src/transformers/models/yoso/configuration_yoso.py +src/transformers/models/yoso/configuration_yoso.py \ No newline at end of file From 2d4c095f2e6f88ed546bd9ee70c91a2d68018b2b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 4 Nov 2022 15:16:25 +0100 Subject: [PATCH 36/47] Use better variable name --- .../models/clipseg/modeling_clipseg.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 5c9c4c74c2153..a4a3cf6bba5de 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1277,43 +1277,45 @@ def forward( activations = hidden_states[::-1] - a = None + output = None for i, (activation, layer, reduce) in enumerate(zip(activations, self.layers, self.reduces)): - if a is not None: - a = reduce(activation) + a + if output is not None: + output = reduce(activation) + output else: - a = reduce(activation) + output = reduce(activation) if i == self.conditional_layer: - a = self.film_mul(conditional_embeddings) * a.permute(1, 0, 2) + self.film_add(conditional_embeddings) - a = a.permute(1, 0, 2) + output = self.film_mul(conditional_embeddings) * output.permute(1, 0, 2) + self.film_add( + conditional_embeddings + ) + output = output.permute(1, 0, 2) layer_outputs = layer( - a, attention_mask=None, causal_attention_mask=None, output_attentions=output_attentions + output, attention_mask=None, causal_attention_mask=None, output_attentions=output_attentions ) - a = layer_outputs[0] + output = layer_outputs[0] if output_hidden_states: - all_hidden_states += (a,) + all_hidden_states += (output,) if output_attentions: all_attentions += (layer_outputs[1],) - a = a[:, 1:, :].permute(0, 2, 1) # remove cls token and reshape to [batch_size, reduce_dim, seq_len] + output = output[:, 1:, :].permute(0, 2, 1) # remove cls token and reshape to [batch_size, reduce_dim, seq_len] - size = int(math.sqrt(a.shape[2])) + size = int(math.sqrt(output.shape[2])) batch_size = conditional_embeddings.shape[0] - a = a.view(batch_size, a.shape[1], size, size) + output = output.view(batch_size, output.shape[1], size, size) - a = 
self.transposed_convolution(a).squeeze() + output = self.transposed_convolution(output).squeeze() if not return_dict: return tuple( v for v in [ - a, + output, all_hidden_states, all_attentions, ] @@ -1321,7 +1323,7 @@ def forward( ) return CLIPSegDecoderOutput( - predicted_masks=a, + predicted_masks=output, hidden_states=all_hidden_states, attentions=all_attentions, ) From 174a8f959da0b951c34df9acec167d62b490f9ec Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 4 Nov 2022 15:48:05 +0100 Subject: [PATCH 37/47] Convert other checkpoint as well --- .../convert_clipseg_original_pytorch_to_hf.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 411f23b53a48e..3b2131d14d517 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -37,9 +37,13 @@ def get_clipseg_config(model_name): vision_config = CLIPSegVisionConfig(patch_size=16) use_complex_transposed_convolution = True if "refined" in model_name else False + reduce_dim = 16 if "rd16" in model_name else 64 config = CLIPSegConfig.from_text_vision_configs( - text_config, vision_config, use_complex_transposed_convolution=use_complex_transposed_convolution + text_config, + vision_config, + use_complex_transposed_convolution=use_complex_transposed_convolution, + reduce_dim=reduce_dim, ) return config @@ -198,14 +202,20 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ # verify values expected_conditional = torch.tensor([0.1110, -0.1882, 0.1645]) expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328]) - if "refined" in model_name: + if model_name == "clipseg-rd64-refined": expected_masks_slice = torch.tensor( [[-10.0407, -9.9431, -10.2646], [-9.9751, -9.7064, -9.9586], [-9.6891, -9.5645, -9.9618]] ) - else: + elif model_name == "clipseg-rd64": expected_masks_slice = torch.tensor( [[-7.2877, -7.2711, -7.2463], [-7.2652, -7.2780, -7.2520], [-7.2239, -7.2204, -7.2001]] ) + elif model_name == "clipseg-rd16": + expected_masks_slice = torch.tensor( + [[-6.3955, -6.4055, -6.4151], [-6.3911, -6.4033, -6.4100], [-6.3474, -6.3702, -6.3762]] + ) + else: + raise ValueError(f"Model name {model_name} not supported.") assert torch.allclose(outputs.predicted_masks[0, :3, :3], expected_masks_slice, atol=1e-3) assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3) @@ -228,9 +238,9 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ # Required parameters parser.add_argument( "--model_name", - default="clipseg", + default="clipseg-rd64", type=str, - choices=["clipseg", "clipseg-rd16", "clipseg-rd64-refined"], + choices=["clipseg-rd16", "clipseg-rd64", "clipseg-rd64-refined"], help=( "Name of the model. 
Supported models are: clipseg-rd64, clipseg-rd16 and clipseg-rd64-refined (rd meaning" " reduce dimension)" From a8eb2de52ca17f8bfd142924345f18321a50bfb9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 7 Nov 2022 14:19:43 +0100 Subject: [PATCH 38/47] Update config, add link to paper --- docs/source/en/model_doc/clipseg.mdx | 2 +- docs/source/en/serialization.mdx | 1 - .../models/clipseg/configuration_clipseg.py | 91 +++++-------------- 3 files changed, 23 insertions(+), 71 deletions(-) diff --git a/docs/source/en/model_doc/clipseg.mdx b/docs/source/en/model_doc/clipseg.mdx index 41083ddff2d2d..c72154883d63b 100644 --- a/docs/source/en/model_doc/clipseg.mdx +++ b/docs/source/en/model_doc/clipseg.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. ## Overview -The CLIPSeg model was proposed in [Image Segmentation Using Text and Image Prompts]() by Timo Lüddecke +The CLIPSeg model was proposed in [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. CLIPSeg adds a minimal decoder on top of a frozen [CLIP](clip) model for zero- and one-shot image segmentation. The abstract from the paper is the following: diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index e32bb6efd0852..1cbc1237f286b 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -56,7 +56,6 @@ Ready-made configurations include the following architectures: - BLOOM - CamemBERT - CLIP -- CLIPSeg - CodeGen - Conditional DETR - ConvBERT diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index a0f49463059da..4183ce6c247ae 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -16,16 +16,9 @@ import copy import os -from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union - - -if TYPE_CHECKING: - from ...processing_utils import ProcessorMixin - from ...utils import TensorType +from typing import Union from ...configuration_utils import PretrainedConfig -from ...onnx import OnnxConfig from ...utils import logging @@ -259,9 +252,9 @@ class CLIPSegConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - text_config_dict (`dict`, *optional*): + text_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`CLIPSegTextConfig`]. - vision_config_dict (`dict`, *optional*): + vision_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`CLIPSegVisionConfig`]. projection_dim (`int`, *optional*, defaults to 512): Dimensionality of text and vision projection layers. 
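The configuration arguments documented here are exactly what the conversion script above composes per checkpoint variant. The following is an illustrative sketch of that composition path; the keyword values mirror the refined rd64 variant and are assumptions for the example rather than canonical defaults:

```python
from transformers import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig

# Build the two tower configs explicitly, as get_clipseg_config() does per model name.
text_config = CLIPSegTextConfig()
vision_config = CLIPSegVisionConfig(patch_size=16)

# Extra keyword arguments are forwarded to CLIPSegConfig, so decoder settings such as
# reduce_dim and the transposed-convolution variant travel alongside the sub-configs.
config = CLIPSegConfig.from_text_vision_configs(
    text_config,
    vision_config,
    reduce_dim=64,                            # 16 for the rd16 checkpoint
    use_complex_transposed_convolution=True,  # only used for the "refined" checkpoint
)

print(config.text_config.hidden_size, config.vision_config.patch_size, config.reduce_dim)
```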
@@ -317,8 +310,8 @@ class CLIPSegConfig(PretrainedConfig): def __init__( self, - text_config_dict=None, - vision_config_dict=None, + text_config=None, + vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, extract_layers=[3, 6, 9], @@ -331,18 +324,25 @@ def __init__( use_complex_transposed_convolution=False, **kwargs ): - super().__init__(text_config_dict=text_config_dict, vision_config_dict=vision_config_dict, **kwargs) + super().__init__(**kwargs) + + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: + text_config = text_config_dict + if vision_config_dict is not None: + vision_config = vision_config_dict - if text_config_dict is None: - text_config_dict = {} - logger.info("text_config_dict is None. Initializing the CLIPSegTextConfig with default values.") + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the CLIPSegTextConfig with default values.") - if vision_config_dict is None: - vision_config_dict = {} - logger.info("vision_config_dict is None. initializing the CLIPSegVisionConfig with default values.") + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the CLIPSegVisionConfig with default values.") - self.text_config = CLIPSegTextConfig(**text_config_dict) - self.vision_config = CLIPSegVisionConfig(**vision_config_dict) + self.text_config = CLIPSegTextConfig(**text_config) + self.vision_config = CLIPSegVisionConfig(**vision_config) self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value @@ -366,7 +366,7 @@ def from_text_vision_configs(cls, text_config: CLIPSegTextConfig, vision_config: [`CLIPSegConfig`]: An instance of a configuration object """ - return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs) + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) def to_dict(self): """ @@ -380,50 +380,3 @@ def to_dict(self): output["vision_config"] = self.vision_config.to_dict() output["model_type"] = self.__class__.model_type return output - - -class CLIPSegOnnxConfig(OnnxConfig): - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("input_ids", {0: "batch", 1: "sequence"}), - ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), - ("attention_mask", {0: "batch", 1: "sequence"}), - ] - ) - - @property - def outputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("logits_per_image", {0: "batch"}), - ("logits_per_text", {0: "batch"}), - ("text_embeds", {0: "batch"}), - ("image_embeds", {0: "batch"}), - ] - ) - - @property - def atol_for_validation(self) -> float: - return 1e-4 - - def generate_dummy_inputs( - self, - processor: "ProcessorMixin", - batch_size: int = -1, - seq_length: int = -1, - framework: Optional["TensorType"] = None, - ) -> Mapping[str, Any]: - - text_input_dict = super().generate_dummy_inputs( - processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework - ) - image_input_dict = super().generate_dummy_inputs( - processor.feature_extractor, batch_size=batch_size, framework=framework - ) - return {**text_input_dict, **image_input_dict} - - @property - def default_onnx_opset(self) -> int: - return 14 From 96409fadfb4d96844e372d1dd70026b18664e85d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 7 Nov 2022 14:34:33 +0100 
Subject: [PATCH 39/47] Add docs --- .../models/clipseg/configuration_clipseg.py | 5 ++--- .../models/clipseg/modeling_clipseg.py | 14 +++----------- tests/models/clipseg/test_modeling_clipseg.py | 9 +-------- tests/models/clipseg/test_processor_clipseg.py | 3 +-- 4 files changed, 7 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 4183ce6c247ae..3548d7c749292 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -39,7 +39,6 @@ class CLIPSegTextConfig(PretrainedConfig): Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: vocab_size (`int`, *optional*, defaults to 49408): Vocabulary size of the CLIPSeg text model. Defines the number of different tokens that can be represented @@ -147,7 +146,6 @@ class CLIPSegVisionConfig(PretrainedConfig): Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. @@ -275,7 +273,8 @@ class CLIPSegConfig(PretrainedConfig): decoder_intermediate_size (`int`, *optional*, defaults to 2048): Dimensionality of the "intermediate" (i.e., feed-forward) layers in the Transformer decoder. conditional_layer (`int`, *optional*, defaults to 0): - ... + The layer to use of the Transformer encoder whose activations will be combined with the condition + embeddings using FiLM (Feature-wise Linear Modulation). If 0, the last layer is used. use_complex_transposed_convolution (`bool`, *optional*, defaults to `False`): Whether to use a more complex transposed convolution in the decoder, enabling more fine-grained segmentation. diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index a4a3cf6bba5de..afcbe3116395a 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -119,8 +119,8 @@ def to_tuple(self) -> Tuple[Any]: class CLIPSegDecoderOutput(ModelOutput): """ Args: - predicted_masks (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): - ... + predicted_masks (`torch.FloatTensor` of shape `(batch_size, height, width)`): + The predicted masks, for each conditioning. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
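The `conditional_layer` description above is where the decoder mixes in the prompt: at that layer the running activation is modulated by FiLM, i.e. scaled and shifted by two linear projections of the conditional embedding. A stripped-down sketch with illustrative shapes (485 tokens corresponds to a 352×352 image with 16×16 patches plus the CLS token; the dimensions are the rd64 defaults):

```python
import torch
from torch import nn

reduce_dim, projection_dim, batch_size, seq_len = 64, 512, 3, 485  # illustrative sizes

film_mul = nn.Linear(projection_dim, reduce_dim)
film_add = nn.Linear(projection_dim, reduce_dim)

output = torch.randn(batch_size, seq_len, reduce_dim)             # running decoder activation
conditional_embeddings = torch.randn(batch_size, projection_dim)  # one embedding per prompt

# FiLM: permute to (seq_len, batch, reduce_dim) so the per-sample scale and shift
# broadcast over the sequence dimension, then permute back — the same shape
# handling as in CLIPSegDecoder.forward.
output = film_mul(conditional_embeddings) * output.permute(1, 0, 2) + film_add(conditional_embeddings)
output = output.permute(1, 0, 2)

print(output.shape)  # torch.Size([3, 485, 64])
```

In the actual decoder this modulation is applied once, at the layer index given by `conditional_layer`, inside the loop over the reduced activations.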
@@ -1312,15 +1312,7 @@ def forward( output = self.transposed_convolution(output).squeeze() if not return_dict: - return tuple( - v - for v in [ - output, - all_hidden_states, - all_attentions, - ] - if v is not None - ) + return tuple(v for v in [output, all_hidden_states, all_attentions] if v is not None) return CLIPSegDecoderOutput( predicted_masks=output, diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 5f310e119c4ea..be0c00775d83e 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -408,14 +408,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CLIPSegModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - CLIPSegModel, - CLIPSegForImageSegmentation, - ) - if is_torch_available() - else () - ) + all_model_classes = (CLIPSegModel, CLIPSegForImageSegmentation) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py index af3a8e0193275..6da7345f6a6c9 100644 --- a/tests/models/clipseg/test_processor_clipseg.py +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -79,8 +79,7 @@ def tearDown(self): def prepare_image_inputs(self): """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. - """ + or a list of PyTorch tensors if one specifies torchify=True.""" image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] From fd7a2a67eb95e7f92905c3982dfbaec3978d65bc Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 7 Nov 2022 14:44:32 +0100 Subject: [PATCH 40/47] Update organization --- .../models/clipseg/modeling_clipseg.py | 29 +++++++++---------- tests/models/clipseg/test_modeling_clipseg.py | 3 +- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index afcbe3116395a..95d8c3eb40f2c 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -39,11 +39,10 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "nielsr/clipseg-rd64-refined" +_CHECKPOINT_FOR_DOC = "CIDAS/clipseg-rd64-refined" CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST = [ - # TODO update organziation - "nielsr/clipseg-rd64-refined", + "CIDAS/clipseg-rd64-refined", # See all CLIPSeg models at https://huggingface.co/models?filter=clipseg ] @@ -807,8 +806,8 @@ def forward( ```python >>> from transformers import CLIPTokenizer, CLIPSegTextModel - >>> tokenizer = CLIPTokenizer.from_pretrained("nielsr/clipseg-rd64-refined") - >>> model = CLIPSegTextModel.from_pretrained("nielsr/clipseg-rd64-refined") + >>> tokenizer = CLIPTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -918,8 +917,8 @@ def forward( >>> import requests >>> from transformers import CLIPSegProcessor, CLIPSegVisionModel - >>> processor = CLIPSegProcessor.from_pretrained("nielsr/clipseg-rd64-refined") - >>> model = CLIPSegVisionModel.from_pretrained("nielsr/clipseg-rd64-refined") + >>> processor = 
CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -994,8 +993,8 @@ def get_text_features( ```python >>> from transformers import CLIPTokenizer, CLIPSegModel - >>> tokenizer = CLIPTokenizer.from_pretrained("nielsr/clipseg-rd64-refined") - >>> model = CLIPSegModel.from_pretrained("nielsr/clipseg-rd64-refined") + >>> tokenizer = CLIPTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) @@ -1041,8 +1040,8 @@ def get_image_features( >>> import requests >>> from transformers import CLIPSegProcessor, CLIPSegModel - >>> processor = CLIPSegProcessor.from_pretrained("nielsr/clipseg-rd64-refined") - >>> model = CLIPSegModel.from_pretrained("nielsr/clipseg-rd64-refined") + >>> processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1093,8 +1092,8 @@ def forward( >>> import requests >>> from transformers import CLIPSegProcessor, CLIPSegModel - >>> processor = CLIPSegProcessor.from_pretrained("nielsr/clipseg-rd64-refined") - >>> model = CLIPSegModel.from_pretrained("nielsr/clipseg-rd64-refined") + >>> processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1403,8 +1402,8 @@ def forward( >>> from PIL import Image >>> import requests - >>> processor = CLIPSegProcessor.from_pretrained("nielsr/clipseg-rd64-refined") - >>> model = CLIPSegForImageSegmentation.from_pretrained("nielsr/clipseg-rd64-refined") + >>> processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index be0c00775d83e..91bedc13c5a8f 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -706,8 +706,7 @@ def prepare_img(): class CLIPSegModelIntegrationTest(unittest.TestCase): @slow def test_inference_image_segmentation(self): - # TODO update to appropriate organization - model_name = "nielsr/clipseg-rd64-refined" + model_name = "CIDAS/clipseg-rd64-refined" processor = CLIPSegProcessor.from_pretrained(model_name) model = CLIPSegForImageSegmentation.from_pretrained(model_name).to(torch_device) From cb0ff6d6507f9734871872b487ba572816a9d039 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 7 Nov 2022 15:17:09 +0100 Subject: [PATCH 41/47] Replace base_model_prefix with clip --- .../convert_clipseg_original_pytorch_to_hf.py | 22 +++++++++---------- .../models/clipseg/modeling_clipseg.py | 11 +++++----- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git 
a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 3b2131d14d517..420da106ad523 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -51,7 +51,7 @@ def get_clipseg_config(model_name): def rename_key(name): # update prefixes if "clip_model" in name: - name = name.replace("clip_model", "clipseg") + name = name.replace("clip_model", "clip") if "transformer" in name: if "visual" in name: name = name.replace("visual.transformer", "vision_model") @@ -127,17 +127,15 @@ def convert_state_dict(orig_state_dict, config): prefix = "text_model" if "weight" in key: - orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ + orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] + orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ dim : dim * 2, : ] - orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] + orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] else: - orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[ - dim : dim * 2 - ] - orig_state_dict[f"clipseg.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] + orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] + orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] + orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] elif "self_attn" in key and "out_proj" not in key: key_split = key.split(".") layer_num = int(key_split[1]) @@ -182,7 +180,7 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ state_dict = convert_state_dict(state_dict, config) missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - if missing_keys != ["clipseg.text_model.embeddings.position_ids", "clipseg.vision_model.embeddings.position_ids"]: + if missing_keys != ["clip.text_model.embeddings.position_ids", "clip.vision_model.embeddings.position_ids"]: raise ValueError("Missing keys that are not expected: {}".format(missing_keys)) if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]: raise ValueError(f"Unexpected keys: {unexpected_keys}") @@ -229,8 +227,8 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_ if push_to_hub: print(f"Pushing model and processor for {model_name} to the hub") - model.push_to_hub(f"nielsr/{model_name}") - processor.push_to_hub(f"nielsr/{model_name}") + model.push_to_hub(f"CIDAS/{model_name}") + processor.push_to_hub(f"CIDAS/{model_name}") if __name__ == "__main__": diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 95d8c3eb40f2c..19797421d9744 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1334,8 +1334,7 @@ def __init__(self, config: CLIPSegConfig): self.config 
= config - # TODO perhaps use clip here? - self.clipseg = CLIPSegModel(config) + self.clip = CLIPSegModel(config) self.extract_layers = config.extract_layers self.decoder = CLIPSegDecoder(config) @@ -1356,7 +1355,7 @@ def get_conditional_embeddings( if len(input_ids) != batch_size: raise ValueError("Make sure to pass as many prompt texts as there are query images") with torch.no_grad(): - conditional_embeddings = self.clipseg.get_text_features( + conditional_embeddings = self.clip.get_text_features( input_ids, attention_mask=attention_mask, position_ids=position_ids ) elif conditional_pixel_values is not None: @@ -1364,7 +1363,7 @@ def get_conditional_embeddings( if len(conditional_pixel_values) != batch_size: raise ValueError("Make sure to pass as many prompt images as there are query images") with torch.no_grad(): - conditional_embeddings = self.clipseg.get_image_features(conditional_pixel_values) + conditional_embeddings = self.clip.get_image_features(conditional_pixel_values) else: raise ValueError( "Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`" @@ -1420,13 +1419,13 @@ def forward( # step 1: forward the query images through the frozen CLIP vision encoder with torch.no_grad(): - vision_outputs = self.clipseg.vision_model( + vision_outputs = self.clip.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=True, # we need the intermediate hidden states return_dict=return_dict, ) - pooled_output = self.clipseg.visual_projection(vision_outputs[1]) + pooled_output = self.clip.visual_projection(vision_outputs[1]) hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2] # we add +1 here as the hidden states also include the initial embeddings From 8c26ccbd8799639ff092836383f63875d1ba81db Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 7 Nov 2022 16:00:34 +0100 Subject: [PATCH 42/47] Fix base_model_prefix --- src/transformers/models/clipseg/modeling_clipseg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 19797421d9744..01f367b593468 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -420,7 +420,7 @@ def forward( return outputs -# Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->CLIPSeg,clip->clipseg +# Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->CLIPSeg class CLIPSegPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -428,7 +428,7 @@ class CLIPSegPreTrainedModel(PreTrainedModel): """ config_class = CLIPSegConfig - base_model_prefix = "clipseg" + base_model_prefix = "clip" supports_gradient_checkpointing = True _keys_to_ignore_on_load_missing = [r"position_ids"] From d79913b3ddee91b5bc2d1df25da8ccd6347f3846 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 7 Nov 2022 18:08:11 +0100 Subject: [PATCH 43/47] Fix checkpoint of config --- .../models/clipseg/configuration_clipseg.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 3548d7c749292..d4afff6d19a08 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ 
b/src/transformers/models/clipseg/configuration_clipseg.py @@ -25,7 +25,7 @@ logger = logging.get_logger(__name__) CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "organization/clipseg-rd64-uni": "https://huggingface.co/organization/clipseg-rd64-uni/resolve/main/config.json", + "CIDAS/clipseg-rd64-refined": "https://huggingface.co/CIDAS/clipseg-rd64-refined/resolve/main/config.json", } @@ -34,7 +34,7 @@ class CLIPSegTextConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the CLIPSeg - [organization/clipseg-rd64-uni](https://huggingface.co/organization/clipseg-rd64-uni) architecture. + [CIDAS/clipseg-rd64-refined](https://huggingface.co/CIDAS/clipseg-rd64-refined) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -73,10 +73,10 @@ class CLIPSegTextConfig(PretrainedConfig): ```python >>> from transformers import CLIPSegTextConfig, CLIPSegTextModel - >>> # Initializing a CLIPSegTextConfig with organization/clipseg-rd64-uni style configuration + >>> # Initializing a CLIPSegTextConfig with CIDAS/clipseg-rd64-refined style configuration >>> configuration = CLIPSegTextConfig() - >>> # Initializing a CLIPSegTextModel (with random weights) from the organization/clipseg-rd64-uni style configuration + >>> # Initializing a CLIPSegTextModel (with random weights) from the CIDAS/clipseg-rd64-refined style configuration >>> model = CLIPSegTextModel(configuration) >>> # Accessing the model configuration @@ -141,7 +141,7 @@ class CLIPSegVisionConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the CLIPSeg - [organization/clipseg-rd64-uni](https://huggingface.co/organization/clipseg-rd64-uni) architecture. + [CIDAS/clipseg-rd64-refined](https://huggingface.co/CIDAS/clipseg-rd64-refined) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
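With the checkpoints now referenced under the CIDAS organization, the pieces added in this series can be exercised end to end. Below is a hedged inference sketch: the sigmoid and the 0.5 threshold are post-processing choices layered on top of the raw outputs, not something the modeling code prescribes, and the output attribute uses the `logits` name introduced at the end of this series (earlier commits call it `predicted_masks`):

```python
import torch
import requests
from PIL import Image

from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation

checkpoint = "CIDAS/clipseg-rd64-refined"
processor = CLIPSegProcessor.from_pretrained(checkpoint)
model = CLIPSegForImageSegmentation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# One 352x352 map is predicted per text prompt.
texts = ["a cat", "a remote", "a blanket"]
inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# The decoder is trained with a BCE-with-logits objective, so a sigmoid yields
# per-pixel probabilities; upsample to the original resolution for visualization.
probs = torch.sigmoid(outputs.logits)  # (3, 352, 352)
probs = torch.nn.functional.interpolate(
    probs.unsqueeze(1), size=image.size[::-1], mode="bilinear", align_corners=False
).squeeze(1)
binary_masks = probs > 0.5  # illustrative threshold
print(binary_masks.shape)
```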
@@ -178,10 +178,10 @@ class CLIPSegVisionConfig(PretrainedConfig): ```python >>> from transformers import CLIPSegVisionConfig, CLIPSegVisionModel - >>> # Initializing a CLIPSegVisionConfig with organization/clipseg-rd64-uni style configuration + >>> # Initializing a CLIPSegVisionConfig with CIDAS/clipseg-rd64-refined style configuration >>> configuration = CLIPSegVisionConfig() - >>> # Initializing a CLIPSegVisionModel (with random weights) from the organization/clipseg-rd64-uni style configuration + >>> # Initializing a CLIPSegVisionModel (with random weights) from the CIDAS/clipseg-rd64-refined style configuration >>> model = CLIPSegVisionModel(configuration) >>> # Accessing the model configuration @@ -286,10 +286,10 @@ class CLIPSegConfig(PretrainedConfig): ```python >>> from transformers import CLIPSegConfig, CLIPSegModel - >>> # Initializing a CLIPSegConfig with organization/clipseg-rd64-uni style configuration + >>> # Initializing a CLIPSegConfig with CIDAS/clipseg-rd64-refined style configuration >>> configuration = CLIPSegConfig() - >>> # Initializing a CLIPSegModel (with random weights) from the organization/clipseg-rd64-uni style configuration + >>> # Initializing a CLIPSegModel (with random weights) from the CIDAS/clipseg-rd64-refined style configuration >>> model = CLIPSegModel(configuration) >>> # Accessing the model configuration From 2a643a0bb22f983d7de624701a9a47fc2661df49 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 7 Nov 2022 18:33:25 +0100 Subject: [PATCH 44/47] Fix config checkpoint --- .../models/clipseg/configuration_clipseg.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index d4afff6d19a08..1fe27b0d0b0f0 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -25,7 +25,7 @@ logger = logging.get_logger(__name__) CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "CIDAS/clipseg-rd64-refined": "https://huggingface.co/CIDAS/clipseg-rd64-refined/resolve/main/config.json", + "CIDAS/clipseg-rd64": "https://huggingface.co/CIDAS/clipseg-rd64/resolve/main/config.json", } @@ -34,7 +34,7 @@ class CLIPSegTextConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the CLIPSeg - [CIDAS/clipseg-rd64-refined](https://huggingface.co/CIDAS/clipseg-rd64-refined) architecture. + [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
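Because earlier commits in this series register the `clipseg` model type in the auto mappings, and the archive map above now points at the hub checkpoint, configurations also resolve through the generic factories. A small sketch, assuming the checkpoint layout is published on the Hub as named here:

```python
from transformers import AutoConfig, CLIPSegConfig

config = AutoConfig.from_pretrained("CIDAS/clipseg-rd64-refined")

# The "clipseg" model_type maps to CLIPSegConfig, which nests the two tower configs.
assert isinstance(config, CLIPSegConfig)
print(type(config.text_config).__name__, type(config.vision_config).__name__)
print(config.extract_layers)  # which vision-encoder layers feed the decoder
```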
@@ -73,10 +73,10 @@ class CLIPSegTextConfig(PretrainedConfig): ```python >>> from transformers import CLIPSegTextConfig, CLIPSegTextModel - >>> # Initializing a CLIPSegTextConfig with CIDAS/clipseg-rd64-refined style configuration + >>> # Initializing a CLIPSegTextConfig with CIDAS/clipseg-rd64 style configuration >>> configuration = CLIPSegTextConfig() - >>> # Initializing a CLIPSegTextModel (with random weights) from the CIDAS/clipseg-rd64-refined style configuration + >>> # Initializing a CLIPSegTextModel (with random weights) from the CIDAS/clipseg-rd64 style configuration >>> model = CLIPSegTextModel(configuration) >>> # Accessing the model configuration @@ -141,7 +141,7 @@ class CLIPSegVisionConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the CLIPSeg - [CIDAS/clipseg-rd64-refined](https://huggingface.co/CIDAS/clipseg-rd64-refined) architecture. + [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -178,10 +178,10 @@ class CLIPSegVisionConfig(PretrainedConfig): ```python >>> from transformers import CLIPSegVisionConfig, CLIPSegVisionModel - >>> # Initializing a CLIPSegVisionConfig with CIDAS/clipseg-rd64-refined style configuration + >>> # Initializing a CLIPSegVisionConfig with CIDAS/clipseg-rd64 style configuration >>> configuration = CLIPSegVisionConfig() - >>> # Initializing a CLIPSegVisionModel (with random weights) from the CIDAS/clipseg-rd64-refined style configuration + >>> # Initializing a CLIPSegVisionModel (with random weights) from the CIDAS/clipseg-rd64 style configuration >>> model = CLIPSegVisionModel(configuration) >>> # Accessing the model configuration @@ -244,7 +244,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], class CLIPSegConfig(PretrainedConfig): r""" [`CLIPSegConfig`] is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to - instantiate CLIPSeg model according to the specified arguments, defining the text model and vision model configs. + instantiate a CLIPSeg model according to the specified arguments, defining the text model and vision model configs. + Instantiating a configuration with the defaults will yield a similar configuration to that of the CLIPSeg + [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
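The docstring examples above all condition the decoder on text. The `get_conditional_embeddings` helper reworked earlier in this series (now routed through `self.clip`) equally accepts prompt images, which is the one-shot setting from the paper. A hedged sketch follows, assuming the forward pass exposes the `conditional_pixel_values` argument that the helper consumes, and again using the final `logits` output name:

```python
import torch
import requests
from PIL import Image

from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation

checkpoint = "CIDAS/clipseg-rd64-refined"
processor = CLIPSegProcessor.from_pretrained(checkpoint)
model = CLIPSegForImageSegmentation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
query_image = Image.open(requests.get(url, stream=True).raw)
# In the one-shot setting the "prompt" is itself an image of the target concept;
# the query image is reused here purely for illustration.
prompt_image = query_image

query_inputs = processor(images=query_image, return_tensors="pt")
prompt_inputs = processor(images=prompt_image, return_tensors="pt")

with torch.no_grad():
    outputs = model(
        pixel_values=query_inputs["pixel_values"],
        conditional_pixel_values=prompt_inputs["pixel_values"],
    )

print(outputs.logits.shape)  # one mask for the single query image
```

Text and visual prompting share the same decoder path; only the source of `conditional_embeddings` differs.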
@@ -286,10 +288,10 @@ class CLIPSegConfig(PretrainedConfig): ```python >>> from transformers import CLIPSegConfig, CLIPSegModel - >>> # Initializing a CLIPSegConfig with CIDAS/clipseg-rd64-refined style configuration + >>> # Initializing a CLIPSegConfig with CIDAS/clipseg-rd64 style configuration >>> configuration = CLIPSegConfig() - >>> # Initializing a CLIPSegModel (with random weights) from the CIDAS/clipseg-rd64-refined style configuration + >>> # Initializing a CLIPSegModel (with random weights) from the CIDAS/clipseg-rd64 style configuration >>> model = CLIPSegModel(configuration) >>> # Accessing the model configuration From 56ee6831817e9f002d251dc96139b80d60b3c74a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 7 Nov 2022 19:46:13 +0100 Subject: [PATCH 45/47] Remove file --- src/transformers/models/clipseg/test.py | 58 ------------------------- 1 file changed, 58 deletions(-) delete mode 100644 src/transformers/models/clipseg/test.py diff --git a/src/transformers/models/clipseg/test.py b/src/transformers/models/clipseg/test.py deleted file mode 100644 index ba42058bd6704..0000000000000 --- a/src/transformers/models/clipseg/test.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch -from PIL import Image - -import requests -from transformers import CLIPSegForImageSegmentation, CLIPSegProcessor - - -model_name = "nielsr/clipseg-rd64-refined" -processor = CLIPSegProcessor.from_pretrained(model_name) -model = CLIPSegForImageSegmentation.from_pretrained(model_name) - - -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -image = prepare_img() -texts = ["a cat", "a remote", "a blanket"] -inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt") - -# forward pass: return dict -with torch.no_grad(): - dict_outputs = model(**inputs, output_attentions=True) - -# forward pass: return tuple -with torch.no_grad(): - tuple_outputs = model(**inputs, output_attentions=True, return_dict=False) - -for idx, key in enumerate(dict_outputs.keys()): - if idx < 3: - assert torch.allclose(dict_outputs[key], tuple_outputs[idx]) - elif key == "vision_model_output": - for i, vision_key in enumerate(dict_outputs[key].keys()): - # last hidden state, pooler output - if isinstance(dict_outputs["vision_model_output"][vision_key], torch.Tensor): - assert torch.allclose(dict_outputs["vision_model_output"][vision_key], tuple_outputs[idx][i]) - # attentions - else: - print("Key:", vision_key) - for j, value in enumerate(dict_outputs["vision_model_output"][vision_key]): - assert torch.allclose(value, tuple_outputs[idx][i][j]) - elif key == "decoder_output": - for j, decoder_key in enumerate(dict_outputs["decoder_output"].keys()): - if isinstance(dict_outputs["decoder_output"][decoder_key], torch.Tensor): - assert torch.allclose(dict_outputs["decoder_output"][decoder_key], tuple_outputs[idx][j]) - -# print(len(dict_outputs), len(tuple_outputs)) - -# print(len(dict_outputs[-1]), len(tuple_outputs[-1])) - -# print(type(dict_outputs[-1]), type(tuple_outputs[-1])) - -# assert torch.allclose(dict_outputs[-1][0], tuple_outputs[-1][0]) - -# for x, y in zip(dict_outputs[-1], tuple_outputs[-1]): -# assert torch.allclose(x, y) From 3af99257e8449052c8c2d88a79403c1958181122 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 7 Nov 2022 20:04:48 +0100 Subject: [PATCH 46/47] Use logits for output --- .../convert_clipseg_original_pytorch_to_hf.py | 2 +- 
 .../models/clipseg/modeling_clipseg.py | 26 +++++++++----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py
index 420da106ad523..778dbca299678 100644
--- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py
+++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py
@@ -215,7 +215,7 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_
     else:
         raise ValueError(f"Model name {model_name} not supported.")

-    assert torch.allclose(outputs.predicted_masks[0, :3, :3], expected_masks_slice, atol=1e-3)
+    assert torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3)
     assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3)
     assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)
     print("Looks ok!")

diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py
index 01f367b593468..87caf24ed4bf6 100644
--- a/src/transformers/models/clipseg/modeling_clipseg.py
+++ b/src/transformers/models/clipseg/modeling_clipseg.py
@@ -118,8 +118,8 @@ def to_tuple(self) -> Tuple[Any]:
 class CLIPSegDecoderOutput(ModelOutput):
     """
     Args:
-        predicted_masks (`torch.FloatTensor` of shape `(batch_size, height, width)`):
-            The predicted masks, for each conditioning.
+        logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
+            Classification scores for each pixel.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or
             when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
@@ -129,7 +129,7 @@ class CLIPSegDecoderOutput(ModelOutput):
         the self-attention heads.
""" - predicted_masks: torch.FloatTensor = None + logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -146,7 +146,7 @@ class CLIPSegImageSegmentationOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - predicted_masks: torch.FloatTensor = None + logits: torch.FloatTensor = None conditional_embeddings: torch.FloatTensor = None pooled_output: torch.FloatTensor = None vision_model_output: BaseModelOutputWithPooling = None @@ -1308,13 +1308,13 @@ def forward( batch_size = conditional_embeddings.shape[0] output = output.view(batch_size, output.shape[1], size, size) - output = self.transposed_convolution(output).squeeze() + logits = self.transposed_convolution(output).squeeze() if not return_dict: - return tuple(v for v in [output, all_hidden_states, all_attentions] if v is not None) + return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None) return CLIPSegDecoderOutput( - predicted_masks=output, + logits=logits, hidden_states=all_hidden_states, attentions=all_attentions, ) @@ -1411,8 +1411,8 @@ def forward( >>> outputs = model(**inputs) - >>> predicted_masks = outputs.predicted_masks - >>> print(predicted_masks.shape) + >>> logits = outputs.logits + >>> print(logits.shape) torch.Size([3, 352, 352]) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1472,20 +1472,20 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - predicted_masks = decoder_outputs.predicted_masks if return_dict else decoder_outputs[0] + logits = decoder_outputs.logits if return_dict else decoder_outputs[0] loss = None if labels is not None: loss_fn = nn.BCEWithLogitsLoss() - loss = loss_fn(predicted_masks, labels) + loss = loss_fn(logits, labels) if not return_dict: - output = (predicted_masks, conditional_embeddings, pooled_output, vision_outputs, decoder_outputs) + output = (logits, conditional_embeddings, pooled_output, vision_outputs, decoder_outputs) return ((loss,) + output) if loss is not None else output return CLIPSegImageSegmentationOutput( loss=loss, - predicted_masks=predicted_masks, + logits=logits, conditional_embeddings=conditional_embeddings, pooled_output=pooled_output, vision_model_output=vision_outputs, From 6e017b5472e33c638f52191a0446a55cf0ebf058 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 8 Nov 2022 09:03:11 +0100 Subject: [PATCH 47/47] Fix tests --- tests/models/clipseg/test_modeling_clipseg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 91bedc13c5a8f..3a338ddbf820a 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -384,7 +384,7 @@ def create_and_check_model_for_image_segmentation(self, config, input_ids, atten with torch.no_grad(): result = model(input_ids, pixel_values) self.parent.assertEqual( - result.predicted_masks.shape, + result.logits.shape, ( self.vision_model_tester.batch_size, self.vision_model_tester.image_size, @@ -720,13 +720,13 @@ def test_inference_image_segmentation(self): # verify the predicted masks self.assertEqual( - outputs.predicted_masks.shape, + outputs.logits.shape, torch.Size((3, 352, 352)), ) expected_masks_slice = torch.tensor( [[-7.4577, -7.4952, -7.4072], [-7.3115, -7.0969, -7.1624], [-6.9472, -6.7641, -6.8911]] ) - 
-        self.assertTrue(torch.allclose(outputs.predicted_masks[0, :3, :3], expected_masks_slice, atol=1e-3))
+        self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3))

         # verify conditional and pooled output
         expected_conditional = torch.tensor([0.5601, -0.0314, 0.1980])
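For readers tracking the rename in the last two patches, here is a minimal usage sketch of the new `logits` output, adapted from the `test.py` script removed in [PATCH 45/47]. The checkpoint name is an assumption (the removed script loaded `nielsr/clipseg-rd64-refined`, while the docs reference `CIDAS/clipseg-rd64`); everything else follows the API shown in the diffs above.

```python
import requests
import torch
from PIL import Image

from transformers import CLIPSegForImageSegmentation, CLIPSegProcessor

# assumed checkpoint name; swap in whichever CLIPSeg checkpoint ends up on the Hub
checkpoint = "CIDAS/clipseg-rd64-refined"
processor = CLIPSegProcessor.from_pretrained(checkpoint)
model = CLIPSegForImageSegmentation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
texts = ["a cat", "a remote", "a blanket"]

# one text prompt per copy of the image
inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# after these patches the per-pixel scores are exposed as `logits` (formerly `predicted_masks`)
print(outputs.logits.shape)  # torch.Size([3, 352, 352])

# training uses nn.BCEWithLogitsLoss, so a sigmoid turns the logits into per-pixel probabilities
masks = torch.sigmoid(outputs.logits)
```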