From 9fdafb78e8024e9a8f1c561d422d973614edb502 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:47:29 -0400 Subject: [PATCH 001/233] added conditional_detr files --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/conditional_detr.mdx | 53 + docs/source/en/serialization.mdx | 1 + src/transformers/__init__.py | 20 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/conditional_detr/__init__.py | 87 + .../configuration_conditional_detr.py | 242 ++ ..._original_pytorch_checkpoint_to_pytorch.py | 298 ++ .../feature_extraction_conditional_detr.py | 938 ++++++ .../modeling_conditional_detr.py | 2547 +++++++++++++++++ .../utils/dummy_vision_objects.py | 7 + tests/models/conditional_detr/__init__.py | 0 ...est_feature_extraction_conditional_detr.py | 338 +++ .../test_modeling_conditional_detr.py | 538 ++++ 21 files changed, 5084 insertions(+) create mode 100644 docs/source/en/model_doc/conditional_detr.mdx create mode 100644 src/transformers/models/conditional_detr/__init__.py create mode 100644 src/transformers/models/conditional_detr/configuration_conditional_detr.py create mode 100644 src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py create mode 100644 src/transformers/models/conditional_detr/modeling_conditional_detr.py create mode 100644 tests/models/conditional_detr/__init__.py create mode 100644 tests/models/conditional_detr/test_feature_extraction_conditional_detr.py create mode 100644 tests/models/conditional_detr/test_modeling_conditional_detr.py diff --git a/README.md b/README.md index e832a113e488d..4c4096d971165 100644 --- a/README.md +++ b/README.md @@ -276,6 +276,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. 
**[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_ko.md b/README_ko.md index 0566c31de2a41..56e2b8fcbea33 100644 --- a/README_ko.md +++ b/README_ko.md @@ -228,6 +228,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_zh-hans.md b/README_zh-hans.md index a3bd914c09a05..57007f837d261 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -252,6 +252,7 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang发布。 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index f544dd5b53d92..b0510ee109e2b 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -264,6 +264,7 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 6a1722cf33ffa..336f0e9d2b817 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,6 +68,7 @@ The documentation is organized into five sections: 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. 
**[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. @@ -212,6 +213,7 @@ Flax), PyTorch, and/or TensorFlow. | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | +| conditional_detr | ❌ | ❌ | ✅ | ❌ | ❌ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx new file mode 100644 index 0000000000000..51316a24e43ee --- /dev/null +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -0,0 +1,53 @@ + + +# Conditional DETR + +## Overview + +The conditional DETR model was proposed in [ConditionalDETR](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. + +The abstract from the paper is the following: + +*The recently-developed DETR approach applies the transformer encoder and decoder architecture to object detection and achieves promising performance. In this paper, we handle the critical issue, slow training convergence, and present a conditional cross-attention mechanism for fast DETR training. Our approach is motivated by that the cross-attention in DETR relies highly on the content embeddings for localizing the four extremities and predicting the box, which increases the need for high-quality content embeddings and thus the training difficulty. Our approach, named conditional DETR, learns a conditional spatial query from the decoder embedding for decoder multi-head cross-attention. The benefit is that through the conditional spatial query, each cross-attention head is able to attend to a band containing a distinct region, e.g., one object extremity or a region inside the object box. This narrows down the spatial range for localizing the distinct regions for object classification and box regression, thus relaxing the dependence on the content embeddings and easing the training. Empirical results show that conditional DETR converges 6.7× faster for the backbones R50 and R101 and 10× faster for stronger backbones DC5-R50 and DC5-R101. Code is available at https://github.com/Atten4Vis/ConditionalDETR.* + + +This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The original code can be found [here](https://github.com/Atten4Vis/ConditionalDETR). 
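The snippet below is a minimal object-detection inference sketch using the classes added in this PR. It assumes that the `DeppMeng/ConditionalDETR` checkpoint referenced in this patch's config archive map and conversion script is available on the Hub, that `post_process` mirrors the DETR feature extractor's behavior of returning per-image dicts of scores, labels and boxes, and the `0.5` score threshold is an arbitrary display choice rather than a value prescribed by the paper:

```python
import requests
import torch
from PIL import Image

from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection

# load an example COCO image
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# checkpoint name as used elsewhere in this patch (assumed to exist on the Hub)
feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("DeppMeng/ConditionalDETR")
model = ConditionalDETRForObjectDetection.from_pretrained("DeppMeng/ConditionalDETR")

# preprocess the image and run the model
inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# rescale the predicted boxes to the original image size (height, width)
target_sizes = torch.tensor([image.size[::-1]])
results = feature_extractor.post_process(outputs, target_sizes)[0]

# keep reasonably confident detections
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    if score > 0.5:
        print(f"{model.config.id2label[label.item()]}: {score:.2f} {box.tolist()}")
```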
+ + +## ConditionalDETRConfig + +[[autodoc]] ConditionalDETRConfig + +## ConditionalDETRFeatureExtractor + +[[autodoc]] ConditionalDETRFeatureExtractor + - __call__ + - pad_and_create_pixel_mask + - post_process + - post_process_segmentation + - post_process_panoptic + +## ConditionalDETRModel + +[[autodoc]] ConditionalDETRModel + - forward + +## ConditionalDETRForObjectDetection + +[[autodoc]] ConditionalDETRForObjectDetection + - forward + +## ConditionalDETRForSegmentation + +[[autodoc]] ConditionalDETRForSegmentation + - forward \ No newline at end of file diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 31ad430e06434..d7ff03e94d116 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -57,6 +57,7 @@ Ready-made configurations include the following architectures: - CamemBERT - CLIP - CodeGen +- conditional_detr - ConvBERT - ConvNeXT - Data2VecText diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c94abb1dac6a3..03c6d930ea631 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -171,6 +171,7 @@ "CLIPVisionConfig", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], + "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDETRConfig"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.cpm": [], @@ -652,6 +653,7 @@ _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") + _import_structure["models.conditional_detr"].append("ConditionalDETRFeatureExtractor") _import_structure["models.donut"].append("DonutFeatureExtractor") _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaProcessor"]) @@ -692,6 +694,15 @@ "DetrPreTrainedModel", ] ) + _import_structure["models.conditional_detr"].extend( + [ + "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", + "ConditionalDETRForObjectDetection", + "ConditionalDETRForSegmentation", + "ConditionalDETRModel", + "ConditionalDETRPreTrainedModel", + ] + ) try: if not is_scatter_available(): @@ -3034,6 +3045,7 @@ CLIPVisionConfig, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer + from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDETRConfig from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer @@ -3453,6 +3465,7 @@ from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor + from .models.conditional_detr import ConditionalDETRFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor @@ -3482,6 +3495,13 @@ except OptionalDependencyNotAvailable: from .utils.dummy_timm_objects import * else: + from .models.conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + 
ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ConditionalDETRModel, + ConditionalDETRPreTrainedModel, + ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, DetrForObjectDetection, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 9cc42c8d1240f..a52a9ae1b298b 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -38,6 +38,7 @@ canine, clip, codegen, + conditional_detr, convbert, convnext, cpm, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index fe50973ac7169..dcd730e60836b 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -43,6 +43,7 @@ ("canine", "CanineConfig"), ("clip", "CLIPConfig"), ("codegen", "CodeGenConfig"), + ("conditional_detr", "ConditionalDETRConfig"), ("convbert", "ConvBertConfig"), ("convnext", "ConvNextConfig"), ("ctrl", "CTRLConfig"), @@ -172,6 +173,7 @@ ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("codegen", "CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("conditional_detr", "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convbert", "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convnext", "CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ctrl", "CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -294,6 +296,7 @@ ("canine", "CANINE"), ("clip", "CLIP"), ("codegen", "CodeGen"), + ("conditional_detr", "conditional_detr"), ("convbert", "ConvBERT"), ("convnext", "ConvNeXT"), ("cpm", "CPM"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 625b79db06494..6409674be7096 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,7 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 79b58d49383d0..4de5373a0c1c6 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -42,6 +42,7 @@ ("canine", "CanineModel"), ("clip", "CLIPModel"), ("codegen", "CodeGenModel"), + ("conditional_detr", "ConditionalDETRModel"), ("convbert", "ConvBertModel"), ("convnext", "ConvNextModel"), ("ctrl", "CTRLModel"), @@ -365,6 +366,7 @@ [ # Do not add new models here, this class will be deprecated in the future. # Model for Image Segmentation mapping + ("conditional_detr", "ConditionalDETRForSegmentation"), ("detr", "DetrForSegmentation"), ] ) @@ -446,6 +448,7 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Object Detection mapping + ("conditional_detr", "ConditionalDETRForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/conditional_detr/__init__.py b/src/transformers/models/conditional_detr/__init__.py new file mode 100644 index 0000000000000..1b72e7d7a3238 --- /dev/null +++ b/src/transformers/models/conditional_detr/__init__.py @@ -0,0 +1,87 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' 
imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_timm_available, is_vision_available + + +_import_structure = { + "configuration_conditional_detr": [ + "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", + "ConditionalDETRConfig", + "ConditionalDETROnnxConfig", + ] +} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDETRFeatureExtractor"] + +try: + if not is_timm_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_conditional_detr"] = [ + "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", + "ConditionalDETRForObjectDetection", + "ConditionalDETRForSegmentation", + "ConditionalDETRModel", + "ConditionalDETRPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, + ConditionalDETRConfig, + ConditionalDETROnnxConfig, + ) + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_conditional_detr import ConditionalDETRFeatureExtractor + + try: + if not is_timm_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ConditionalDETRModel, + ConditionalDETRPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py new file mode 100644 index 0000000000000..ce004dc6fc15c --- /dev/null +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -0,0 +1,242 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" CONDITIONAL_DETR model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "DeppMeng/ConditionalDETR": "https://huggingface.co/DeppMeng/ConditionalDETR/resolve/main/config.json", +} + + +class ConditionalDETRConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate + a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR + [DeppMeng/ConditionalDETR](https://huggingface.co/DeppMeng/ConditionalDETR) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`ConditionalDETRModel`] can detect in a single image. For COCO, we recommend 300 queries. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in encoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) + for more details. 
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a + list of all available models, see [this + page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). + class_cost (`float`, *optional*, defaults to 2): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + cls_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the classification loss in the object detection loss. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss used for classification. 
+ + Examples: + + ```python + >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig + + >>> # Initializing a CONDITIONAL_DETR DeppMeng/ConditionalDETR style configuration + >>> configuration = ConditionalDETRConfig() + + >>> # Initializing a model from the DeppMeng/ConditionalDETR style configuration + >>> model = ConditionalDETRModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "conditional_detr" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + num_queries=300, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + classifier_dropout=0.0, + scale_embedding=False, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + dilation=False, + class_cost=2, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + cls_loss_coefficient=2, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + focal_alpha=0.25, + **kwargs + ): + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.dilation = dilation + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.cls_loss_coefficient = cls_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.focal_alpha = focal_alpha + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + +class ConditionalDETROnnxConfig(OnnxConfig): + + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + 
("pixel_mask", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-5 + + @property + def default_onnx_opset(self) -> int: + return 12 diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000..2d7abc3c088b5 --- /dev/null +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,298 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert CONDITIONAL_DETR checkpoints.""" + + +import argparse +import json +from collections import OrderedDict +from pathlib import Path + +import torch +from PIL import Image + +import requests +from huggingface_hub import hf_hub_download +from transformers import ( + ConditionalDETRConfig, + ConditionalDETRFeatureExtractor, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +# here we list all keys to be renamed (original name on the left, our name on the right) +rename_keys = [] +for i in range(6): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) + # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append( + ( + 
f"transformer.decoder.layers.{i}.cross_attn.out_proj.weight", + f"decoder.layers.{i}.encoder_attn.out_proj.weight", + ) + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.cross_attn.out_proj.bias", + f"decoder.layers.{i}.encoder_attn.out_proj.bias", + ) + ) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) + + # q, k, v projections in self/cross-attention in decoder for conditional DETR + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight")) + # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight")) + + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", 
f"decoder.layers.{i}.ca_qcontent_proj.bias")) + # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias")) + +# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads +# for conditional DETR, also convert reference point head and query scale MLP +rename_keys.extend( + [ + ("input_proj.weight", "input_projection.weight"), + ("input_proj.bias", "input_projection.bias"), + ("query_embed.weight", "query_position_embeddings.weight"), + ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), + ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), + ("class_embed.weight", "class_labels_classifier.weight"), + ("class_embed.bias", "class_labels_classifier.bias"), + ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), + ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), + ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), + ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), + ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), + ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), + ("transformer.decoder.ref_point_head.layers.0.weight", "decoder.ref_point_head.layers.0.weight"), + ("transformer.decoder.ref_point_head.layers.0.bias", "decoder.ref_point_head.layers.0.bias"), + ("transformer.decoder.ref_point_head.layers.1.weight", "decoder.ref_point_head.layers.1.weight"), + ("transformer.decoder.ref_point_head.layers.1.bias", "decoder.ref_point_head.layers.1.bias"), + ("transformer.decoder.query_scale.layers.0.weight", "decoder.query_scale.layers.0.weight"), + ("transformer.decoder.query_scale.layers.0.bias", "decoder.query_scale.layers.0.bias"), + ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), + ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), + ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), + ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias") + ] +) + + +def rename_key(state_dict, old, new): + val = state_dict.pop(old) + state_dict[new] = val + + +def rename_backbone_keys(state_dict): + new_state_dict = OrderedDict() + for key, value in state_dict.items(): + if "backbone.0.body" in key: + new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") + new_state_dict[new_key] = value + else: + new_state_dict[key] = value + + return new_state_dict + + +def read_in_q_k_v(state_dict, is_panoptic=False): + prefix = "" + if is_panoptic: + prefix = "conditional_detr." 
+ + # first: transformer encoder + for i in range(6): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our CONDITIONAL_DETR structure. + """ + + # load default config + config = ConditionalDETRConfig() + # set backbone and dilation attributes + if "resnet101" in model_name: + config.backbone = "resnet101" + if "dc5" in model_name: + config.dilation = True + is_panoptic = "panoptic" in model_name + if is_panoptic: + config.num_labels = 250 + else: + config.num_labels = 91 + repo_id = "datasets/huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load feature extractor + format = "coco_panoptic" if is_panoptic else "coco_detection" + feature_extractor = ConditionalDETRFeatureExtractor(format=format) + + # prepare image + img = prepare_img() + encoding = feature_extractor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info(f"Converting model {model_name}...") + + # load original model from torch hub + conditional_detr = torch.hub.load("DeppMeng/ConditionalDETR", model_name, pretrained=True).eval() + state_dict = conditional_detr.state_dict() + # rename keys + for src, dest in rename_keys: + if is_panoptic: + src = "conditional_detr." + src + rename_key(state_dict, src, dest) + state_dict = rename_backbone_keys(state_dict) + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, is_panoptic=is_panoptic) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "conditional_detr.model." if is_panoptic else "model." + for key in state_dict.copy().keys(): + if is_panoptic: + if ( + key.startswith("conditional_detr") + and not key.startswith("class_labels_classifier") + and not key.startswith("bbox_predictor") + ): + val = state_dict.pop(key) + state_dict["conditional_detr.model" + key[4:]] = val + elif "class_labels_classifier" in key or "bbox_predictor" in key: + val = state_dict.pop(key) + state_dict["conditional_detr." 
+ key] = val + elif key.startswith("bbox_attention") or key.startswith("mask_head"): + continue + else: + val = state_dict.pop(key) + state_dict[prefix + key] = val + else: + if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = ConditionalDETRForSegmentation(config) if is_panoptic else ConditionalDETRForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + # verify our conversion + original_outputs = conditional_detr(pixel_values) + outputs = model(pixel_values) + assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) + assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) + if is_panoptic: + assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) + + # Save model and feature extractor + logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name", + default="conditional_detr_resnet50", + type=str, + help="Name of the CONDITIONAL_DETR model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + args = parser.parse_args() + convert_conditional_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py new file mode 100644 index 0000000000000..07550124c157e --- /dev/null +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -0,0 +1,938 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Feature extractor class for CONDITIONAL_DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Dict, List, Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor +from ...utils import TensorType, is_torch_available, logging + + +if is_torch_available(): + import torch + from torch import nn + +logger = logging.get_logger(__name__) + + +ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] + + +# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +def center_to_corners_format(x): + """ + Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format + (x_0, y_0, x_1, y_1). + """ + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +def corners_to_center_format(x): + """ + Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1, + y_1) to center format (center_x, center_y, width, height). + """ + x_transposed = x.T + x0, y0, x1, y1 = x_transposed[0], x_transposed[1], x_transposed[2], x_transposed[3] + b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] + return np.stack(b, axis=-1) + + +def masks_to_boxes(masks): + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + + Returns a [N, 4] tensor, with the boxes in corner (xyxy) format. + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. 
+def rgb_to_id(color): + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def id_to_rgb(id_map): + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class ConditionalDETRFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a CONDITIONAL_DETR feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + + Args: + format (`str`, *optional*, defaults to `"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input to a certain `size`. + size (`int`, *optional*, defaults to 800): + Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a + sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of + the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * + height / width, size)`. + max_size (`int`, *optional*, defaults to `1333`): + The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is + set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`): + The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean. + image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the + ImageNet std. 
+ """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + format="coco_detection", + do_resize=True, + size=800, + max_size=1333, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.format = self._is_valid_format(format) + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean + self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std + + def _is_valid_format(self, format): + if format not in ["coco_detection", "coco_panoptic"]: + raise ValueError(f"Format {format} not supported") + return format + + def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): + if self.format == "coco_detection": + image, target = self.prepare_coco_detection(image, target, return_segmentation_masks) + return image, target + elif self.format == "coco_panoptic": + image, target = self.prepare_coco_panoptic(image, target, masks_path) + return image, target + else: + raise ValueError(f"Format {self.format} not supported") + + # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 + def convert_coco_poly_to_mask(self, segmentations, height, width): + + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 + def prepare_coco_detection(self, image, target, return_segmentation_masks=False): + """ + Convert the target in COCO format into the format expected by CONDITIONAL_DETR. 
+ """ + w, h = image.size + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # get all COCO annotations for the given image + anno = target["annotations"] + + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=w) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = np.asarray(classes, dtype=np.int64) + + if return_segmentation_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = self.convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.reshape((-1, 3)) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if return_segmentation_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["class_labels"] = classes + if return_segmentation_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in anno], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno], dtype=np.int64) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + + return image, target + + def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): + w, h = image.size + ann_info = target.copy() + ann_path = pathlib.Path(masks_path) / ann_info["file_name"] + + if "segments_info" in ann_info: + masks = np.asarray(Image.open(ann_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([ann["id"] for ann in ann_info["segments_info"]]) + masks = masks == ids[:, None, None] + masks = np.asarray(masks, dtype=np.uint8) + + labels = np.asarray([ann["category_id"] for ann in ann_info["segments_info"]], dtype=np.int64) + + target = {} + target["image_id"] = np.asarray( + [ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]], dtype=np.int64 + ) + if return_masks: + target["masks"] = masks + target["class_labels"] = labels + + target["boxes"] = masks_to_boxes(masks) + + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + if "segments_info" in ann_info: + target["iscrowd"] = np.asarray([ann["iscrowd"] for ann in ann_info["segments_info"]], dtype=np.int64) + target["area"] = np.asarray([ann["area"] for ann in ann_info["segments_info"]], dtype=np.float32) + + return image, target + + def _resize(self, image, size, target=None, max_size=None): + """ + Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller + edge of the image will be matched to this number. + + If given, also resize the target accordingly. 
+ """ + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size + else: + # size returned must be (w, h) since we use PIL to resize images + # so we revert the tuple + return get_size_with_aspect_ratio(image_size, size, max_size)[::-1] + + size = get_size(image.size, size, max_size) + rescaled_image = self.resize(image, size=size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + w, h = size + target["size"] = np.asarray([h, w], dtype=np.int64) + + if "masks" in target: + # use PyTorch as current workaround + # TODO replace by self.resize + masks = torch.from_numpy(target["masks"][:, None]).float() + interpolated_masks = nn.functional.interpolate(masks, size=(h, w), mode="nearest")[:, 0] > 0.5 + target["masks"] = interpolated_masks.numpy() + + return rescaled_image, target + + def _normalize(self, image, mean, std, target=None): + """ + Normalize the image with a certain mean and std. + + If given, also normalize the target bounding boxes based on the size of the image. + """ + + image = self.normalize(image, mean=mean, std=std) + if target is None: + return image, None + + target = target.copy() + h, w = image.shape[-2:] + + if "boxes" in target: + boxes = target["boxes"] + boxes = corners_to_center_format(boxes) + boxes = boxes / np.asarray([w, h, w, h], dtype=np.float32) + target["boxes"] = boxes + + return image, target + + def __call__( + self, + images: ImageInput, + annotations: Union[List[Dict], List[List[Dict]]] = None, + return_segmentation_masks: Optional[bool] = False, + masks_path: Optional[pathlib.Path] = None, + pad_and_return_pixel_mask: Optional[bool] = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s) and optional annotations. Images are by default + padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are + real/which are padding. + + + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. 
In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + annotations (`Dict`, `List[Dict]`, *optional*): + The corresponding annotations in COCO format. + + In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_detection"`, the + annotations for each image should have the following format: {'image_id': int, 'annotations': + [annotation]}, with the annotations being a list of COCO object annotations. + + In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the + annotations for each image should have the following format: {'image_id': int, 'file_name': str, + 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic annotations. + + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to also include instance segmentation masks as part of the labels in case `format = + "coco_detection"`. + + masks_path (`pathlib.Path`, *optional*): + Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only + relevant in case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`. + + pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether or not to pad images up to the largest image in a batch and create a pixel mask. + + If left to the default, will return a pixel mask that is: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor` + objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if + *"pixel_mask"* is in `self.model_input_names`). + - **labels** -- Optional labels to be fed to a model (when `annotations` are provided) + """ + # Input type checking for clearer error + + valid_images = False + valid_annotations = False + valid_masks_path = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must be of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
+ ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + # Check that annotations has a valid type + if annotations is not None: + if not is_batched: + if self.format == "coco_detection": + if isinstance(annotations, dict) and "image_id" in annotations and "annotations" in annotations: + if isinstance(annotations["annotations"], (list, tuple)): + # an image can have no annotations + if len(annotations["annotations"]) == 0 or isinstance(annotations["annotations"][0], dict): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations, dict) and "image_id" in annotations and "segments_info" in annotations: + if isinstance(annotations["segments_info"], (list, tuple)): + # an image can have no segments (?) + if len(annotations["segments_info"]) == 0 or isinstance( + annotations["segments_info"][0], dict + ): + valid_annotations = True + else: + if isinstance(annotations, (list, tuple)): + if len(images) != len(annotations): + raise ValueError("There must be as many annotations as there are images") + if isinstance(annotations[0], Dict): + if self.format == "coco_detection": + if isinstance(annotations[0]["annotations"], (list, tuple)): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations[0]["segments_info"], (list, tuple)): + valid_annotations = True + + if not valid_annotations: + raise ValueError( + """ + Annotations must be of type `Dict` (single image) or `List[Dict]` (batch of images). In case of object + detection, each dictionary should contain the keys 'image_id' and 'annotations', with the latter + being a list of annotations in COCO format. In case of panoptic segmentation, each dictionary + should contain the keys 'file_name', 'image_id' and 'segments_info', with the latter being a list + of annotations in COCO format. + """ + ) + + # Check that masks_path has a valid type + if masks_path is not None: + if self.format == "coco_panoptic": + if isinstance(masks_path, pathlib.Path): + valid_masks_path = True + if not valid_masks_path: + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + " `pathlib.Path` object."
+ ) + + if not is_batched: + images = [images] + if annotations is not None: + annotations = [annotations] + + # prepare (COCO annotations as a list of Dict -> CONDITIONAL_DETR target as a single Dict per image) + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + image, target = self.prepare(image, target, return_segmentation_masks, masks_path) + images[idx] = image + annotations[idx] = target + + # transformations (resizing + normalization) + if self.do_resize and self.size is not None: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._resize(image=image, target=target, size=self.size, max_size=self.max_size) + images[idx] = image + annotations[idx] = target + else: + for idx, image in enumerate(images): + images[idx] = self._resize(image=image, target=None, size=self.size, max_size=self.max_size)[0] + + if self.do_normalize: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._normalize( + image=image, mean=self.image_mean, std=self.image_std, target=target + ) + images[idx] = image + annotations[idx] = target + else: + images = [ + self._normalize(image=image, mean=self.image_mean, std=self.image_std)[0] for image in images + ] + + if pad_and_return_pixel_mask: + # pad images up to largest image in batch and create pixel_mask + max_size = self._max_by_axis([list(image.shape) for image in images]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in images: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + images = padded_images + + # return as BatchFeature + data = {} + data["pixel_values"] = images + if pad_and_return_pixel_mask: + data["pixel_mask"] = pixel_mask + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + # Convert to TensorType + tensor_type = return_tensors + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + if not tensor_type == TensorType.PYTORCH: + raise ValueError("Only PyTorch is supported for the moment.") + else: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + + encoded_inputs["labels"] = [ + {k: torch.from_numpy(v) for k, v in target.items()} for target in annotations + ] + + return encoded_inputs + + def _max_by_axis(self, the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + def pad_and_create_pixel_mask( + self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None + ): + """ + Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. + + Args: + pixel_values_list (`List[torch.Tensor]`): + List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. 
If set to `'pt'`, return PyTorch `torch.Tensor` + objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if + *"pixel_mask"* is in `self.model_input_names`). + + """ + + max_size = self._max_by_axis([list(image.shape) for image in pixel_values_list]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in pixel_values_list: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + + # return as BatchFeature + data = {"pixel_values": padded_images, "pixel_mask": pixel_mask} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs + + # POSTPROCESSING METHODS + # inspired by https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py#L258 + def post_process(self, outputs, target_sizes): + """ + Converts the output of [`ConditionalDETRForObjectDetection`] into the format expected by the COCO api. Only + supports PyTorch. + + Args: + outputs ([`ConditionalDETRObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). For visualization, this should be the image size after data + augment, but before padding. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = topk_indexes // out_logits.shape[2] + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into image segmentation predictions. Only supports + PyTorch. + + Parameters: + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. 
+ threshold (`float`, *optional*, defaults to 0.9): + Threshold to use to filter out queries. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image + in the batch as predicted by the model. + """ + out_logits, raw_masks = outputs.logits, outputs.pred_masks + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1 + + predictions = {"scores": cur_scores, "labels": cur_classes, "masks": cur_masks} + preds.append(predictions) + return preds + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into actual instance segmentation predictions. Only + supports PyTorch. + + Args: + results (`List[Dict]`): + Results list obtained by [`~ConditionalDETRFeatureExtractor.post_process`], to which "masks" results + will be added. + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). + max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). + threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an + image in the batch as predicted by the model. 
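+
+        A minimal sketch of the intended call order (assuming `outputs` comes from a
+        [`ConditionalDETRForSegmentation`] forward pass and the size tensors are built by the caller):
+
+        ```python
+        >>> results = feature_extractor.post_process(outputs, orig_target_sizes)
+        >>> results = feature_extractor.post_process_instance(results, outputs, orig_target_sizes, max_target_sizes)
+        ```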
+ """ + + if len(orig_target_sizes) != len(max_target_sizes): + raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes") + max_h, max_w = max_target_sizes.max(0)[0].tolist() + outputs_masks = outputs.pred_masks.squeeze(2) + outputs_masks = nn.functional.interpolate( + outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False + ) + outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() + + for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + img_h, img_w = t[0], t[1] + results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) + results[i]["masks"] = nn.functional.interpolate( + results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" + ).byte() + + return results + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 + def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into actual panoptic predictions. Only supports + PyTorch. + + Parameters: + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data + augmentation but before batching. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to + None, it will default to the `processed_sizes`. + is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + Dictionary mapping class indices to either True or False, depending on whether or not they are a thing. + If not set, defaults to the `is_thing_map` of COCO panoptic. + threshold (`float`, *optional*, defaults to 0.85): + Threshold to use to filter out queries. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for + an image in the batch as predicted by the model. 
+ """ + if target_sizes is None: + target_sizes = processed_sizes + if len(processed_sizes) != len(target_sizes): + raise ValueError("Make sure to pass in as many processed_sizes as target_sizes") + + if is_thing_map is None: + # default to is_thing_map of COCO panoptic + is_thing_map = {i: i <= 90 for i in range(201)} + + out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes + if not len(out_logits) == len(raw_masks) == len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" + ) + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = center_to_corners_format(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + if len(cur_boxes) != len(cur_classes): + raise ValueError("Not as many boxes as there are classes") + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_classes): + if not is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) + + np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) + np_seg_img = np_seg_img.view(final_h, final_w, 3) + np_seg_img = np_seg_img.numpy() + + m_id = torch.from_numpy(rgb_to_id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_classes.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + 
cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_classes[i].item() + segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) + del cur_classes + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py new file mode 100644 index 0000000000000..1f63d22be27a5 --- /dev/null +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -0,0 +1,2547 @@ +# coding=utf-8 +# Copyright 2022 Attn4Vis and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Conditional DETR model.""" + + +import math +import random +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +from torch import Tensor, nn +from torch.nn import functional as F + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import torch_int_div +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + is_vision_available, + logging, + replace_return_docstrings, + requires_backends, +) +from .configuration_conditional_detr import ConditionalDETRConfig + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_vision_available(): + from .feature_extraction_conditional_detr import center_to_corners_format + +if is_timm_available(): + from timm import create_model + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ConditionalDETRConfig" +_CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" + +CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Atten4Vis/ConditionalDETR", + # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr +] + + + +@dataclass +class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): + """ + Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + reference_points: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class ConditionalDETRModelOutput(Seq2SeqModelOutput): + """ + Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + reference_points: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDETR +class ConditionalDETRObjectDetectionOutput(ModelOutput): + """ + Output type of [`ConditionalDETRForObjectDetection`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided): + Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding + boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDETR +class ConditionalDETRSegmentationOutput(ModelOutput): + """ + Output type of [`ConditionalDETRForSegmentation`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. 
+ loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding + boxes. + pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): + Segmentation masks logits for all queries. See also [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks + respectively. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. 
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + pred_masks: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +# BELOW: utilities copied from +# https://github.com/facebookresearch/detr/blob/master/backbone.py +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDETR +class ConditionalDETRFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super(ConditionalDETRFrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(ConditionalDETRFrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDETR +def replace_batch_norm(m, name=""): + for attr_str in dir(m): + target_attr = getattr(m, attr_str) + if isinstance(target_attr, nn.BatchNorm2d): + frozen = ConditionalDETRFrozenBatchNorm2d(target_attr.num_features) + bn = getattr(m, attr_str) + frozen.weight.data.copy_(bn.weight) + frozen.bias.data.copy_(bn.bias) + frozen.running_mean.data.copy_(bn.running_mean) + frozen.running_var.data.copy_(bn.running_var) + setattr(m, attr_str, frozen) + for n, ch in m.named_children(): + replace_batch_norm(ch, n) + + +# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder with Detr->ConditionalDETR +class ConditionalDETRTimmConvEncoder(nn.Module): + """ + Convolutional encoder 
(backbone) from the timm library. + + nn.BatchNorm2d layers are replaced by ConditionalDETRFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, name: str, dilation: bool): + super().__init__() + + kwargs = {} + if dilation: + kwargs["output_stride"] = 16 + + requires_backends(self, ["timm"]) + + backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs) + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = self.model.feature_info.channels() + + if "resnet" in name: + for name, parameter in self.model.named_parameters(): + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDETR +class ConditionalDETRConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + +# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDETR +class ConditionalDETRSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
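+
+    The embedding returned by `forward` has shape `(batch_size, 2 * embedding_dim, height, width)`; with
+    `embedding_dim = d_model // 2` (as set in `build_position_encoding` below), its channel dimension therefore
+    matches the model's hidden size.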
+ """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2) / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDETR +class ConditionalDETRLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + h, w = pixel_values.shape[-2:] + i = torch.arange(w, device=pixel_values.device) + j = torch.arange(h, device=pixel_values.device) + x_emb = self.column_embeddings(i) + y_emb = self.row_embeddings(j) + pos = torch.cat([x_emb.unsqueeze(0).repeat(h, 1, 1), y_emb.unsqueeze(1).repeat(1, w, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDETR +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = ConditionalDETRSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = ConditionalDETRLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + +# function to generate sine positional embedding for 2d coordinates +def gen_sineembed_for_position(pos_tensor): + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * (dim_t // 2) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + pos = torch.cat((pos_y, pos_x), dim=2) + return pos + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - 
x).clamp(min=eps) + return torch.log(x1/x2) + +class DetrAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the CONDITIONAL_DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + key_value_states: Optional[torch.Tensor] = None, + key_value_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # add key-value position embeddings to the key value states + if key_value_position_embeddings is not None: + key_value_states_original = key_value_states + key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states_original), -1, bsz) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states_original), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, 
tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + +class ConditionalDETRAttention(nn.Module): + """ + Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. + + The key q_proj, k_proj, v_proj are defined outside the attention. + This attention allows the dim of q, k to be different to v. + """ + + def __init__( + self, + embed_dim: int, + out_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.out_dim = out_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + ) + # head dimension of values + self.v_head_dim = out_dim // num_heads + if self.v_head_dim * num_heads != self.out_dim: + raise ValueError( + f"out_dim must be divisible by num_heads (got `out_dim`: {self.out_dim} and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + + self.out_proj = nn.Linear(out_dim, out_dim, bias=bias) + + def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + key_states: Optional[torch.Tensor] = None, + value_states: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = hidden_states * self.scaling + # get key, value proj + key_states = self._qk_shape(key_states, -1, bsz) + value_states = self._v_shape(value_states, -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + v_proj_shape = (bsz * self.num_heads, -1, self.v_head_dim) + query_states = self._qk_shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*v_proj_shape) + + src_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.v_head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.v_head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.out_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class ConditionalDETREncoderLayer(nn.Module): + def __init__(self, config: ConditionalDETRConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_embeddings (`torch.FloatTensor`, *optional*): position embeddings, to be added to hidden_states. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class ConditionalDETRDecoderLayer(nn.Module): + def __init__(self, config: ConditionalDETRConfig): + super().__init__() + self.embed_dim = config.d_model + + d_model = config.d_model + # Decoder Self-Attention projections + self.sa_qcontent_proj = nn.Linear(d_model, d_model) + self.sa_qpos_proj = nn.Linear(d_model, d_model) + self.sa_kcontent_proj = nn.Linear(d_model, d_model) + self.sa_kpos_proj = nn.Linear(d_model, d_model) + self.sa_v_proj = nn.Linear(d_model, d_model) + + self.self_attn = ConditionalDETRAttention( + embed_dim=self.embed_dim, + out_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + + # Decoder Cross-Attention projections + self.ca_qcontent_proj = nn.Linear(d_model, d_model) + self.ca_qpos_proj = nn.Linear(d_model, d_model) + self.ca_kcontent_proj = nn.Linear(d_model, d_model) + self.ca_kpos_proj = nn.Linear(d_model, d_model) + self.ca_v_proj = nn.Linear(d_model, d_model) + self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) + + self.encoder_attn = ConditionalDETRAttention( + self.embed_dim*2, + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + self.nhead = config.decoder_attention_heads + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + query_sine_embed: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + is_first: Optional[bool] = False + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, 
src_len)` where padding elements are indicated by very large negative values. + position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the cross-attention layer. + query_position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the self-attention layer. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # ========== Begin of Self-Attention ============= + # Apply projections here + # shape: num_queries x batch_size x 256 + q_content = self.sa_qcontent_proj(hidden_states) # target is the input of the first decoder layer. zero by default. + q_pos = self.sa_qpos_proj(query_position_embeddings) + k_content = self.sa_kcontent_proj(hidden_states) + k_pos = self.sa_kpos_proj(query_position_embeddings) + v = self.sa_v_proj(hidden_states) + + bs, num_queries, n_model = q_content.shape + _, hw, _ = k_content.shape + + q = q_content + q_pos + k = k_content + k_pos + hidden_states, self_attn_weights = self.self_attn( + hidden_states=q, + attention_mask=attention_mask, + key_states=k, + value_states=v, + output_attentions=output_attentions, + ) + # ============ End of Self-Attention ============= + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # ========== Begin of Cross-Attention ============= + # Apply projections here + # shape: num_queries x batch_size x 256 + q_content = self.ca_qcontent_proj(hidden_states) + k_content = self.ca_kcontent_proj(encoder_hidden_states) + v = self.ca_v_proj(encoder_hidden_states) + + bs, num_queries, n_model = q_content.shape + _, hw, _ = k_content.shape + + k_pos = self.ca_kpos_proj(position_embeddings) + + # For the first decoder layer, we concatenate the positional embedding predicted from + # the object query (the positional embedding) into the original query (key) in DETR. 
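+ # In the conditional cross-attention below, the content part (q_content / k_content) and the spatial part
+ # (the sine embedding of the predicted reference point for the queries, the 2D position embedding for the
+ # keys) are concatenated per head instead of being summed. Queries and keys therefore end up with dimension
+ # n_model * 2, which is why `self.encoder_attn` was built with `embed_dim=self.embed_dim * 2`, while the
+ # values keep dimension n_model.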
+ if is_first: + q_pos = self.ca_qpos_proj(query_position_embeddings) + q = q_content + q_pos + k = k_content + k_pos + else: + q = q_content + k = k_content + + q = q.view(bs, num_queries, self.nhead, n_model//self.nhead) + query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) + query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model//self.nhead) + q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) + k = k.view(bs, hw, self.nhead, n_model//self.nhead) + k_pos = k_pos.view(bs, hw, self.nhead, n_model//self.nhead) + k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=q, + attention_mask=encoder_attention_mask, + key_states=k, + value_states=v, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # ============ End of Cross-Attention ============= + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDETR +class ConditionalDETRClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDETR +class ConditionalDETRPreTrainedModel(PreTrainedModel): + config_class = ConditionalDETRConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + xavier_std = self.config.init_xavier_std + + if isinstance(module, ConditionalDETRMHAttentionMap): + nn.init.zeros_(module.k_linear.bias) + 
nn.init.zeros_(module.q_linear.bias) + nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) + nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) + elif isinstance(module, ConditionalDETRLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, ConditionalDETRDecoder): + module.gradient_checkpointing = value + + +CONDITIONAL_DETR_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`ConditionalDETRConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CONDITIONAL_DETR_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See [`ConditionalDETRFeatureExtractor.__call__`] for + details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. 
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR +class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`ConditionalDETREncoderLayer`]. + + The encoder updates the flattened feature map through multiple self-attention layers. + + Small tweak for CONDITIONAL_DETR: + + - position_embeddings are added to the forward pass. + + Args: + config: ConditionalDETRConfig + """ + + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + self.layers = nn.ModuleList([ConditionalDETREncoderLayer(config) for _ in range(config.encoder_layers)]) + + # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + # we add position_embeddings as extra input to the encoder_layer + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDETRDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some small tweaks for CONDITIONAL_DETR: + + - position_embeddings and query_position_embeddings are added to the forward pass. + - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. 
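+ - a 2D reference point is predicted for each object query via `ref_point_head`; its sine embedding, modulated
+ by the `query_scale` MLP (except in the first decoder layer), is passed to every layer as `query_sine_embed`,
+ and the predicted reference points are also returned in the decoder output.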
+ + Args: + config: ConditionalDETRConfig + """ + + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + + self.layers = nn.ModuleList([ConditionalDETRDecoderLayer(config) for _ in range(config.decoder_layers)]) + # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output + self.layernorm = nn.LayerNorm(config.d_model) + d_model = config.d_model + self.gradient_checkpointing = False + + # query_scale is the FFN applied on f to generate transformation T + self.query_scale = MLP(d_model, d_model, d_model, 2) + self.ref_point_head = MLP(d_model, d_model, 2, 2) + for layer_id in range(config.decoder_layers - 1): + self.layers[layer_id + 1].ca_qpos_proj = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + query_position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The query embeddings that are passed into the decoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`: + + - 1 for queries that are **not masked**, + - 0 for queries that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each cross-attention layer. + query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + , *optional*): Position embeddings that are added to the queries and keys in each self-attention layer. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + input_shape = inputs_embeds.size()[:-1] + + combined_attention_mask = None + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # optional intermediate hidden states + intermediate = () if self.config.auxiliary_loss else None + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + reference_points_before_sigmoid = self.ref_point_head(query_position_embeddings) # [num_queries, batch_size, 2] + reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) + obj_center = reference_points[..., :2].transpose(0, 1) + # get sine embedding for the query vector + query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + if idx == 0: + pos_transformation = 1 + else: + pos_transformation = self.query_scale(hidden_states) + # apply transformation + query_sine_embed = query_sine_embed_before_transformation * pos_transformation + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + position_embeddings, + query_position_embeddings, + query_sine_embed, + encoder_hidden_states, + encoder_attention_mask, + None, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + query_sine_embed=query_sine_embed, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + is_first=(idx==0) + ) + + hidden_states = layer_outputs[0] + + if self.config.auxiliary_loss: + hidden_states = self.layernorm(hidden_states) + intermediate += (hidden_states,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # finally, apply layernorm + hidden_states = self.layernorm(hidden_states) + + # add hidden states from the last 
decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # stack intermediate decoder activations + if self.config.auxiliary_loss: + intermediate = torch.stack(intermediate) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate, reference_points] + if v is not None + ) + return ConditionalDETRDecoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + intermediate_hidden_states=intermediate, + reference_points=reference_points + ) + + +@add_start_docstrings( + """ + The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without + any specific head on top. + """, + CONDITIONAL_DETR_START_DOCSTRING, +) +class ConditionalDETRModel(ConditionalDETRPreTrainedModel): + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = ConditionalDETRTimmConvEncoder(config.backbone, config.dilation) + position_embeddings = build_position_encoding(config) + self.backbone = ConditionalDETRConvModel(backbone, position_embeddings) + + # Create projection layer + self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) + + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = ConditionalDETREncoder(config) + self.decoder = ConditionalDETRDecoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ConditionalDETRModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if 
pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), device=device) + + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # pixel_values should be of shape (batch_size, num_channels, height, width) + # pixel_mask should be of shape (batch_size, height, width) + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # get final feature map and downsampled mask + feature_map, mask = features[-1] + + if mask is None: + raise ValueError("Backbone does not return downsampled pixel mask") + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + projected_feature_map = self.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + inputs_embeds=queries, + attention_mask=None, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return ConditionalDETRModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + reference_points=decoder_outputs.reference_points + ) + + +@add_start_docstrings( + """ + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on 
top, for tasks
+ such as COCO detection.
+ """,
+ CONDITIONAL_DETR_START_DOCSTRING,
+)
+# Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR
+class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel):
+ def __init__(self, config: ConditionalDETRConfig):
+ super().__init__(config)
+
+ # CONDITIONAL_DETR encoder-decoder model
+ self.model = ConditionalDETRModel(config)
+
+ # Object detection heads
+ self.class_labels_classifier = nn.Linear(
+ config.d_model, config.num_labels
+ ) # unlike DETR, no extra "no object" class is added, as classification uses a sigmoid focal loss
+ self.bbox_predictor = ConditionalDETRMLPPredictionHead(
+ input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py
+ @torch.jit.unused
+ def _set_aux_loss(self, outputs_class, outputs_coord):
+ # this is a workaround to make torchscript happy, as torchscript
+ # doesn't support dictionary with non-homogeneous values, such
+ # as a dict having both a Tensor and a list.
+ return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
+
+ @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=ConditionalDETRObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values,
+ pixel_mask=None,
+ decoder_attention_mask=None,
+ encoder_outputs=None,
+ inputs_embeds=None,
+ decoder_inputs_embeds=None,
+ labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
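+
+ As a purely illustrative example (hypothetical values), a batch containing one image with two annotated
+ objects could be passed as `labels = [{"class_labels": torch.tensor([1, 2]), "boxes": torch.rand(2, 4)}]`.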
+ + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # model predicts bounding boxes and corresponding COCO classes + >>> logits = outputs.logits + >>> bboxes = outputs.pred_boxes + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through CONDITIONAL_DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # class logits + predicted bounding boxes + logits = self.class_labels_classifier(sequence_output) + + reference = outputs.reference_points if return_dict else outputs[-1] + reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) + outputs_coords = [] + hs = sequence_output + tmp = self.bbox_predictor(hs) + tmp[..., :2] += reference_before_sigmoid + pred_boxes = tmp.sigmoid() + # pred_boxes = self.bbox_predictor(sequence_output).sigmoid() + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = ConditionalDETRHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = ConditionalDETRLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] + outputs_class = self.class_labels_classifier(intermediate) + + for lvl in range(hs.shape[0]): + tmp = self.bbox_predictor(hs[lvl]) + tmp[..., :2] += reference_before_sigmoid + outputs_coord = tmp.sigmoid() + outputs_coords.append(outputs_coord) + outputs_coord = torch.stack(outputs_coords) + + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": self.config.cls_loss_coefficient, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] 
for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return ConditionalDETRObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks + such as COCO panoptic. + + """, + CONDITIONAL_DETR_START_DOCSTRING, +) +# Copied from transformers.models.detr.modeling_detr.DetrForSegmentation with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR +class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + # object detection model + self.conditional_detr = ConditionalDETRForObjectDetection(config) + + # segmentation head + hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads + intermediate_channel_sizes = self.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes + + self.mask_head = ConditionalDETRMaskHeadSmallConv( + hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size + ) + + self.bbox_attention = ConditionalDETRMHAttentionMap( + hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ConditionalDETRSegmentationOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each + dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels, + bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves + should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`. 
+ + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + >>> model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # model predicts COCO classes, bounding boxes, and masks + >>> logits = outputs.logits + >>> bboxes = outputs.pred_boxes + >>> masks = outputs.pred_masks + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones((batch_size, height, width), device=device) + + # First, get list of feature maps and position embeddings + features, position_embeddings_list = self.conditional_detr.model.backbone(pixel_values, pixel_mask=pixel_mask) + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + feature_map, mask = features[-1] + batch_size, num_channels, height, width = feature_map.shape + projected_feature_map = self.conditional_detr.model.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.conditional_detr.model.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.conditional_detr.model.query_position_embeddings.weight.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.conditional_detr.model.decoder( + inputs_embeds=queries, + attention_mask=None, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + 
encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Sixth, compute logits, pred_boxes and pred_masks + logits = self.conditional_detr.class_labels_classifier(sequence_output) + pred_boxes = self.conditional_detr.bbox_predictor(sequence_output).sigmoid() + + memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width) + mask = flattened_mask.view(batch_size, height, width) + + # FIXME h_boxes takes the last one computed, keep this in mind + # important: we need to reverse the mask, since in the original implementation the mask works reversed + # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32) + bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask) + + seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) + + pred_masks = seg_masks.view(batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = ConditionalDETRHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality", "masks"] + criterion = ConditionalDETRLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + outputs_loss["pred_masks"] = pred_masks + if self.config.auxiliary_loss: + intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1] + outputs_class = self.class_labels_classifier(intermediate) + outputs_coord = self.bbox_predictor(intermediate).sigmoid() + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + weight_dict["loss_mask"] = self.config.mask_loss_coefficient + weight_dict["loss_dice"] = self.config.dice_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs + else: + output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return ConditionalDETRSegmentationOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + pred_masks=pred_masks, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=decoder_outputs.last_hidden_state, + 
decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def _expand(tensor, length: int): + return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py +# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDETR +class ConditionalDETRMaskHeadSmallConv(nn.Module): + """ + Simple convolutional head, using group norm. Upsampling is done using a FPN approach + """ + + def __init__(self, dim, fpn_dims, context_dim): + super().__init__() + + if dim % 8 != 0: + raise ValueError( + "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8" + ) + + inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + + self.lay1 = nn.Conv2d(dim, dim, 3, padding=1) + self.gn1 = nn.GroupNorm(8, dim) + self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1) + self.gn2 = nn.GroupNorm(8, inter_dims[1]) + self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = nn.GroupNorm(8, inter_dims[2]) + self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = nn.GroupNorm(8, inter_dims[3]) + self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = nn.GroupNorm(8, inter_dims[4]) + self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1) + + self.dim = dim + + self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): + # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with + # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32). + # We expand the projected feature map to match the number of heads. 
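+ # To make the shapes explicit: with batch_size=b, num_queries=q and n_heads=h, `_expand(x, q)` has shape
+ # (b * q, d_model, height/32, width/32) and `bbox_mask.flatten(0, 1)` has shape (b * q, h, height/32, width/32),
+ # so the concatenation below is along the channel dimension and yields (b * q, d_model + h, height/32, width/32).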
+ x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) + + x = self.lay1(x) + x = self.gn1(x) + x = nn.functional.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter1(fpns[0]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay3(x) + x = self.gn3(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter2(fpns[1]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay4(x) + x = self.gn4(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter3(fpns[2]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay5(x) + x = self.gn5(x) + x = nn.functional.relu(x) + + x = self.out_lay(x) + return x + + +# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDETR +class ConditionalDETRMHAttentionMap(nn.Module): + """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + + self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 + + def forward(self, q, k, mask: Optional[Tensor] = None): + q = self.q_linear(q) + k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) + keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) + + if mask is not None: + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = self.dropout(weights) + return weights + + +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. 
Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py +class ConditionalDETRLoss(nn.Module): + """ + This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process happens in two steps: 1) + we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair + of matched ground-truth / prediction (supervise class and box). + + + + Args: + matcher (`ConditionalDETRHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parmeter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor of dim + [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + src_logits = outputs["logits"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2]+1], + dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:,:,:-1] + loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1] + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. 
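As a sanity check on the classification term: `loss_labels` scatters the matched class ids into a one-hot tensor with one extra trailing "no-object" column, drops that column, and feeds the result to the sigmoid focal loss above, scaled by the number of queries. A toy, self-contained rerun of that arithmetic (the loss function is copied from above; the sizes, class ids and `num_boxes` are made up for illustration):

```python
import torch
from torch import nn

def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
    prob = inputs.sigmoid()
    ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)
    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss
    return loss.mean(1).sum() / num_boxes

# toy setup: 1 image, 4 queries, 3 real classes (class id 3 plays the role of "no object")
logits = torch.randn(1, 4, 3)
target_classes = torch.tensor([[0, 3, 3, 2]])  # queries 1 and 2 are unmatched

# build one-hot targets with an extra no-object column, then drop it (as loss_labels does)
onehot = torch.zeros(1, 4, 3 + 1)
onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
onehot = onehot[:, :, :-1]

num_boxes = 2  # two matched ground-truth boxes in this toy batch
print(sigmoid_focal_loss(logits, onehot, num_boxes) * logits.shape[1])
```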
+ """ + logits = outputs["logits"] + device = logits.device + tgt_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(src_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the masks: the focal loss and the dice loss. + + Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. + """ + if "pred_masks" not in outputs: + raise KeyError("No predicted masks found in outputs") + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = nn.functional.interpolate( + src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False + ) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + "masks": self.loss_masks, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + 
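The two permutation helpers are the glue between the matcher and every loss: they flatten the per-image `(prediction_indices, target_indices)` pairs into a single advanced index over the `(batch, query)` grid. A short standalone illustration with invented matcher output:

```python
import torch

def _get_src_permutation_idx(indices):
    # one (prediction_indices, target_indices) pair per image in the batch
    batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
    src_idx = torch.cat([src for (src, _) in indices])
    return batch_idx, src_idx

# hypothetical matcher output for a batch of 2 images:
# image 0: queries 5 and 11 matched to targets 1 and 0; image 1: query 3 matched to target 0
indices = [
    (torch.tensor([5, 11]), torch.tensor([1, 0])),
    (torch.tensor([3]), torch.tensor([0])),
]

batch_idx, src_idx = _get_src_permutation_idx(indices)
print(batch_idx, src_idx)  # tensor([0, 0, 1]) tensor([ 5, 11,  3])

# pred_boxes[(batch_idx, src_idx)] then selects exactly the matched predictions
pred_boxes = torch.rand(2, 12, 4)  # (batch_size, num_queries, 4)
print(pred_boxes[(batch_idx, src_idx)].shape)  # torch.Size([3, 4])
```

The mirror-image `_get_tgt_permutation_idx` does the same for the concatenated targets, so matched predictions and matched ground-truth boxes line up element-wise inside each loss.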
def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDETR,detr->conditional_detr +class ConditionalDETRMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py +class ConditionalDETRHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. 
+ giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 or bbox_cost == 0 or giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. + + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["class_labels"] for v in targets]) + tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(tgt_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py + + +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. 
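To make the cost construction above concrete, here is a stripped-down sketch for a single image: a focal-style classification cost plus the L1 box cost, fed to SciPy's Hungarian solver. The GIoU term is omitted for brevity and the cost weights are illustrative placeholders, not the configured defaults.

```python
import torch
from scipy.optimize import linear_sum_assignment

# toy: one image, 4 queries, 3 classes, 2 ground-truth boxes
out_prob = torch.rand(4, 3)   # already sigmoid-ed class probabilities
out_bbox = torch.rand(4, 4)   # predicted (center_x, center_y, w, h), normalized
tgt_ids = torch.tensor([2, 0])
tgt_bbox = torch.rand(2, 4)

alpha, gamma = 0.25, 2.0
neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
class_cost = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]  # (4, 2)

bbox_cost = torch.cdist(out_bbox, tgt_bbox, p=1)                      # (4, 2)

cost_matrix = 2 * class_cost + 5 * bbox_cost  # weights here are made up for the sketch
row_ind, col_ind = linear_sum_assignment(cost_matrix.numpy())
print(list(zip(row_ind, col_ind)))  # e.g. [(1, 0), (3, 1)]: query -> target assignment
```

In the batched path above, the same matrix is built for the whole batch at once and split per image with `cost_matrix.split(sizes, -1)` before running the assignment on each slice.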
+ + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# modified from torchvision to also return the union +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. + + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / area + + +# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306 + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index e1f4f3b1fd9fa..f61b8dfdda282 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -24,6 +24,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class ConditionalDETRFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + 
requires_backends(self, ["vision"]) + + class ConvNextFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/conditional_detr/__init__.py b/tests/models/conditional_detr/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py new file mode 100644 index 0000000000000..b5e19fe005da7 --- /dev/null +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -0,0 +1,338 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_vision_available + +from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDETRFeatureExtractor + + +class ConditionalDETRFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "max_size": self.max_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to ConditionalDETRFeatureExtractor, + assuming do_resize is set to True with a scalar size. 
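The helper documented here (its body follows) encodes the feature extractor's shortest-edge resize: the short side is scaled to `size` and the long side follows proportionally. A standalone sketch of the same arithmetic, assuming the tester's default `size=18` and ignoring the `max_size` cap on the long side (the tester deliberately sets `max_size` high enough that it never triggers):

```python
def expected_resized_hw(height: int, width: int, size: int = 18):
    """Shortest side becomes `size`; the other side scales to keep the aspect ratio."""
    if width < height:
        return int(size * height / width), size
    if width > height:
        return size, int(size * width / height)
    return size, size

print(expected_resized_hw(400, 300))  # (24, 18): width is the short side
print(expected_resized_hw(300, 400))  # (18, 24): height is the short side
```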
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size * h / w) + expected_width = self.size + elif w > h: + expected_height = self.size + expected_width = int(self.size * w / h) + else: + expected_height = self.size + expected_width = self.size + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + +@require_torch +@require_vision +class ConditionalDETRFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = ConditionalDETRFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = ConditionalDETRFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "max_size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = 
feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_equivalence_pad_and_create_pixel_mask(self): + # Initialize feature_extractors + feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict) + feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test whether the method "pad_and_return_pixel_mask" and calling the feature extractor return the same tensors + encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") + encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") + + assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # encode them + feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + assert torch.allclose(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = 
torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["labels"][0]["size"], expected_size) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # encode them + # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic + feature_extractor = ConditionalDETRFeatureExtractor(format="coco_panoptic") + encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + assert torch.allclose(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify masks + expected_masks_sum = 822338 + self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["labels"][0]["size"], expected_size) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py 
b/tests/models/conditional_detr/test_modeling_conditional_detr.py new file mode 100644 index 0000000000000..38933f26c6627 --- /dev/null +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -0,0 +1,538 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch CONDITIONAL_DETR model. """ + + +import inspect +import math +import unittest + +from transformers import ConditionalDETRConfig, is_timm_available, is_vision_available +from transformers.testing_utils import require_timm, require_vision, slow, torch_device +from transformers.utils import cached_property + +from ...generation.test_generation_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor + + +if is_timm_available(): + import torch + + from transformers import ConditionalDETRForObjectDetection, ConditionalDETRForSegmentation, ConditionalDETRModel + + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDETRFeatureExtractor + + +class ConditionalDETRModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=256, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + min_size=200, + max_size=200, + n_targets=8, + num_labels=91, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.min_size = min_size + self.max_size = max_size + self.n_targets = n_targets + self.num_labels = num_labels + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) + + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + 
target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + return ConditionalDETRConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_conditional_detr_model(self, config, pixel_values, pixel_mask, labels): + model = ConditionalDETRModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) + ) + + def create_and_check_conditional_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = ConditionalDETRForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_timm +class ConditionalDETRModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + ConditionalDETRModel, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ) + if is_timm_available() + else () + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ in ["ConditionalDETRForObjectDetection", "ConditionalDETRForSegmentation"]: + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = 
ConditionalDETRModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConditionalDETRConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_conditional_detr_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_conditional_detr_model(*config_and_inputs) + + def test_conditional_detr_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_conditional_detr_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="CONDITIONAL_DETR does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @slow + def test_model_outputs_equivalence(self): + # TODO Niels: fix me! + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + decoder_seq_length = self.model_tester.decoder_seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + decoder_key_length = self.model_tester.decoder_seq_length + encoder_key_length = self.model_tester.encoder_seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 6 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "ConditionalDETRForObjectDetection": + correct_outlen += 1 + # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks + if model_class.__name__ == "ConditionalDETRForSegmentation": + correct_outlen += 2 + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), 
self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, 
inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "ConditionalDETRForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + configs_no_init.init_xavier_std = 1e9 + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "bbox_attention" in name and "bias" not in name: + self.assertLess( + 100000, + abs(param.data.max().item()), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class ConditionalDETRModelIntegrationTests(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + if is_vision_available() + else None + ) + + def test_inference_no_head(self): + model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape = torch.Size((1, 300, 256)) + assert outputs.last_hidden_state.shape == expected_shape + expected_slice = torch.tensor( + [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + def test_inference_object_detection_head(self): + model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, 
model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + def test_inference_panoptic_segmentation_head(self): + model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-18.1565, -1.7568, -13.5029], [-16.8888, -1.4138, -14.1028], [-17.5709, -2.5080, -11.8654]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.5344, 0.1789, 0.9285], [0.4420, 0.0572, 0.0875], [0.6630, 0.6887, 0.1017]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + expected_shape_masks = torch.Size((1, model.config.num_queries, 200, 267)) + self.assertEqual(outputs.pred_masks.shape, expected_shape_masks) + expected_slice_masks = torch.tensor( + [[-7.7558, -10.8788, -11.9797], [-11.8881, -16.4329, -17.7451], [-14.7316, -19.7383, -20.3004]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, atol=1e-3)) From 4f57d8eb7075133ce0a41376cd23aff02e3b8146 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:54:36 -0400 Subject: [PATCH 002/233] checked copies --- docs/source/en/index.mdx | 2 +- .../modeling_conditional_detr.py | 63 ++++++++++++------- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 336f0e9d2b817..58901fbc647f3 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,7 +68,7 @@ The documentation is organized into five sections: 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. 
**[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . +1. **[conditional_detr](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 1f63d22be27a5..4ce1f8f3d5776 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -153,8 +153,8 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding - boxes. + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and @@ -217,12 +217,13 @@ class ConditionalDETRSegmentationOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding - boxes. + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + unnormalized bounding boxes. pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): - Segmentation masks logits for all queries. See also [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or - [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks - respectively. 
+ Segmentation masks logits for all queries. See also + [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation + masks respectively. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and @@ -326,7 +327,6 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) -# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder with Detr->ConditionalDETR class ConditionalDETRTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. @@ -1581,7 +1581,6 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -# Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel): def __init__(self, config: ConditionalDETRConfig): super().__init__(config) @@ -1670,7 +1669,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] @@ -1755,7 +1754,6 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -# Copied from transformers.models.detr.modeling_detr.DetrForSegmentation with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): def __init__(self, config: ConditionalDETRConfig): super().__init__(config) @@ -1807,22 +1805,40 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation - >>> from PIL import Image + >>> import io >>> import requests + >>> from PIL import Image + >>> import torch + >>> import numpy + + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from transformers.models.conditional_detr.feature_extraction_conditional_detr import rgb_to_id >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") - >>> model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained( + ... "facebook/conditional_detr-resnet-50-panoptic" + ... 
) + >>> model = ConditionalDETRForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") + >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") + + >>> # forward pass >>> outputs = model(**inputs) - >>> # model predicts COCO classes, bounding boxes, and masks - >>> logits = outputs.logits - >>> bboxes = outputs.pred_boxes - >>> masks = outputs.pred_masks + + >>> # use the `post_process_panoptic` method of `ConditionalDETRFeatureExtractor` to convert to COCO format + >>> processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0) + >>> result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0] + + >>> # the segmentation is stored in a special-format png + >>> panoptic_seg = Image.open(io.BytesIO(result["png_string"])) + >>> panoptic_seg = numpy.array(panoptic_seg, dtype=numpy.uint8) + >>> # retrieve the ids corresponding to each mask + >>> panoptic_seg_id = rgb_to_id(panoptic_seg) + >>> panoptic_seg_id.shape + (800, 1066) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1903,7 +1919,9 @@ def forward( seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) - pred_masks = seg_masks.view(batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + pred_masks = seg_masks.view( + batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1] + ) loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: @@ -1985,7 +2003,8 @@ def __init__(self, dim, fpn_dims, context_dim): if dim % 8 != 0: raise ValueError( - "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8" + "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in" + " GroupNorm is set to 8" ) inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] @@ -2077,7 +2096,7 @@ def forward(self, q, k, mask: Optional[Tensor] = None): weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) if mask is not None: - weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights From f17ed0bee841dbf53376596a2060a19eff4504b0 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:55:14 -0400 Subject: [PATCH 003/233] checked copies --- .../models/conditional_detr/modeling_conditional_detr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 4ce1f8f3d5776..1473df759cd8e 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2347,7 +2347,6 @@ def forward(self, outputs, targets): # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDETR,detr->conditional_detr class ConditionalDETRMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), 
used to predict the normalized center coordinates, From a295255ae9dc8848ff502a7ac39161fc78d0f41a Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 01:22:01 -0400 Subject: [PATCH 004/233] fixed style and copies --- docs/source/en/_toctree.yml | 8 +- .../configuration_conditional_detr.py | 8 - ..._original_pytorch_checkpoint_to_pytorch.py | 54 +++++-- .../feature_extraction_conditional_detr.py | 4 +- .../modeling_conditional_detr.py | 144 +++++++++++------- 5 files changed, 133 insertions(+), 85 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 6135a830181a4..6886b6b662bd0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,7 +42,8 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - sections: + - isExpanded: false + sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +59,6 @@ - local: tasks/multiple_choice title: Multiple choice title: Task guides - isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification @@ -358,6 +358,8 @@ sections: - local: model_doc/beit title: BEiT + - local: model_doc/conditional_detr + title: ConditionalDETR - local: model_doc/convnext title: ConvNeXT - local: model_doc/cvt @@ -495,4 +497,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API \ No newline at end of file + title: API diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index ce004dc6fc15c..bfb759ca246ca 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -211,14 +211,6 @@ def num_attention_heads(self) -> int: def hidden_size(self) -> int: return self.d_model - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model - class ConditionalDETROnnxConfig(OnnxConfig): diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index 2d7abc3c088b5..4b56d729d1f35 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -94,29 +94,55 @@ rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) # q, k, v projections in self/cross-attention in decoder for conditional DETR - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight") + ) + 
rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight") + ) rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight") + ) # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight") + ) rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight") + ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias") + ) rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias") + ) # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias") + ) rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) - 
rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias") + ) # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads # for conditional DETR, also convert reference point head and query scale MLP @@ -144,7 +170,7 @@ ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), - ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias") + ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias"), ] ) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 07550124c157e..f37e3e1fb69bf 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -704,8 +704,8 @@ def post_process(self, outputs, target_sizes): topk_boxes = topk_indexes // out_logits.shape[2] labels = topk_indexes % out_logits.shape[2] boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4)) - + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + # and from relative [0, 1] to absolute [0, height] coordinates img_h, img_w = target_sizes.unbind(1) scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 1473df759cd8e..54dea544f3da9 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -62,13 +62,13 @@ ] - @dataclass class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): """ - Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, - namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them - gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + Base class for outputs of the Conditional DETR decoder. This class adds one attribute to + BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output + of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary + decoding losses. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -97,9 +97,10 @@ class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): @dataclass class ConditionalDETRModelOutput(Seq2SeqModelOutput): """ - Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, - namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them - gone through a layernorm. 
This is useful when training the model with auxiliary decoding losses. + Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to + Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder + layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding + losses. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -389,6 +390,7 @@ def forward(self, pixel_values, pixel_mask): return out, pos + # Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ @@ -479,6 +481,7 @@ def build_position_encoding(config): return position_embedding + # function to generate sine positional embedding for 2d coordinates def gen_sineembed_for_position(pos_tensor): scale = 2 * math.pi @@ -493,11 +496,13 @@ def gen_sineembed_for_position(pos_tensor): pos = torch.cat((pos_y, pos_x), dim=2) return pos + def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) - return torch.log(x1/x2) + return torch.log(x1 / x2) + class DetrAttention(nn.Module): """ @@ -521,7 +526,8 @@ def __init__( self.head_dim = embed_dim // num_heads if self.head_dim * num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." ) self.scaling = self.head_dim**-0.5 @@ -585,7 +591,8 @@ def forward( if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" ) if attention_mask is not None: @@ -614,7 +621,8 @@ def forward( if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) @@ -625,12 +633,13 @@ def forward( return attn_output, attn_weights_reshaped + class ConditionalDETRAttention(nn.Module): """ Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. - The key q_proj, k_proj, v_proj are defined outside the attention. - This attention allows the dim of q, k to be different to v. + The key q_proj, k_proj, v_proj are defined outside the attention. This attention allows the dim of q, k to be + different to v. """ def __init__( @@ -649,7 +658,8 @@ def __init__( self.head_dim = embed_dim // num_heads if self.head_dim * num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." 
) # head dimension of values self.v_head_dim = out_dim // num_heads @@ -663,6 +673,7 @@ def __init__( def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() @@ -696,7 +707,8 @@ def forward( if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" ) if attention_mask is not None: @@ -725,7 +737,8 @@ def forward( if attn_output.size() != (bsz * self.num_heads, tgt_len, self.v_head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is {attn_output.size()}" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is" + f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.v_head_dim) @@ -818,19 +831,18 @@ def __init__(self, config: ConditionalDETRConfig): self.sa_kcontent_proj = nn.Linear(d_model, d_model) self.sa_kpos_proj = nn.Linear(d_model, d_model) self.sa_v_proj = nn.Linear(d_model, d_model) - + self.self_attn = ConditionalDETRAttention( embed_dim=self.embed_dim, out_dim=self.embed_dim, num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout + dropout=config.attention_dropout, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - # Decoder Cross-Attention projections self.ca_qcontent_proj = nn.Linear(d_model, d_model) @@ -841,10 +853,7 @@ def __init__(self, config: ConditionalDETRConfig): self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) self.encoder_attn = ConditionalDETRAttention( - self.embed_dim*2, - self.embed_dim, - config.decoder_attention_heads, - dropout=config.attention_dropout + self.embed_dim * 2, self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) @@ -862,7 +871,7 @@ def forward( encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, - is_first: Optional[bool] = False + is_first: Optional[bool] = False, ): """ Args: @@ -888,7 +897,9 @@ def forward( # ========== Begin of Self-Attention ============= # Apply projections here # shape: num_queries x batch_size x 256 - q_content = self.sa_qcontent_proj(hidden_states) # target is the input of the first decoder layer. zero by default. + q_content = self.sa_qcontent_proj( + hidden_states + ) # target is the input of the first decoder layer. zero by default. 
q_pos = self.sa_qpos_proj(query_position_embeddings) k_content = self.sa_kcontent_proj(hidden_states) k_pos = self.sa_kpos_proj(query_position_embeddings) @@ -924,7 +935,7 @@ def forward( k_pos = self.ca_kpos_proj(position_embeddings) - # For the first decoder layer, we concatenate the positional embedding predicted from + # For the first decoder layer, we concatenate the positional embedding predicted from # the object query (the positional embedding) into the original query (key) in DETR. if is_first: q_pos = self.ca_qpos_proj(query_position_embeddings) @@ -934,12 +945,12 @@ def forward( q = q_content k = k_content - q = q.view(bs, num_queries, self.nhead, n_model//self.nhead) + q = q.view(bs, num_queries, self.nhead, n_model // self.nhead) query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) - query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model//self.nhead) + query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model // self.nhead) q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) - k = k.view(bs, hw, self.nhead, n_model//self.nhead) - k_pos = k_pos.view(bs, hw, self.nhead, n_model//self.nhead) + k = k.view(bs, hw, self.nhead, n_model // self.nhead) + k_pos = k_pos.view(bs, hw, self.nhead, n_model // self.nhead) k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) # Cross-Attention Block @@ -996,8 +1007,9 @@ def forward(self, hidden_states: torch.Tensor): hidden_states = self.out_proj(hidden_states) return hidden_states + class MLP(nn.Module): - """ Very simple multi-layer perceptron (also called FFN)""" + """Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() @@ -1066,8 +1078,8 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See [`ConditionalDETRFeatureExtractor.__call__`] for - details. + Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See + [`ConditionalDETRFeatureExtractor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Mask to avoid performing attention on padding pixel values. 
Mask values selected in `[0, 1]`: @@ -1323,11 +1335,13 @@ def forward( all_self_attns = () if output_attentions else None all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - reference_points_before_sigmoid = self.ref_point_head(query_position_embeddings) # [num_queries, batch_size, 2] + reference_points_before_sigmoid = self.ref_point_head( + query_position_embeddings + ) # [num_queries, batch_size, 2] reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) - obj_center = reference_points[..., :2].transpose(0, 1) + obj_center = reference_points[..., :2].transpose(0, 1) # get sine embedding for the query vector - query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1339,7 +1353,7 @@ def forward( if idx == 0: pos_transformation = 1 else: - pos_transformation = self.query_scale(hidden_states) + pos_transformation = self.query_scale(hidden_states) # apply transformation query_sine_embed = query_sine_embed_before_transformation * pos_transformation if self.gradient_checkpointing and self.training: @@ -1372,7 +1386,7 @@ def custom_forward(*inputs): encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, - is_first=(idx==0) + is_first=(idx == 0), ) hidden_states = layer_outputs[0] @@ -1401,7 +1415,14 @@ def custom_forward(*inputs): if not return_dict: return tuple( v - for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate, reference_points] + for v in [ + hidden_states, + all_hidden_states, + all_self_attns, + all_cross_attentions, + intermediate, + reference_points, + ] if v is not None ) return ConditionalDETRDecoderOutput( @@ -1410,14 +1431,14 @@ def custom_forward(*inputs): attentions=all_self_attns, cross_attentions=all_cross_attentions, intermediate_hidden_states=intermediate, - reference_points=reference_points + reference_points=reference_points, ) @add_start_docstrings( """ - The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without - any specific head on top. + The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. """, CONDITIONAL_DETR_START_DOCSTRING, ) @@ -1570,14 +1591,14 @@ def forward( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, - reference_points=decoder_outputs.reference_points + reference_points=decoder_outputs.reference_points, ) @add_start_docstrings( """ - CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks - such as COCO detection. + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. 
""", CONDITIONAL_DETR_START_DOCSTRING, ) @@ -1669,7 +1690,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] @@ -1748,8 +1769,8 @@ def forward( @add_start_docstrings( """ - CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks - such as COCO panoptic. + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, + for tasks such as COCO panoptic. """, CONDITIONAL_DETR_START_DOCSTRING, @@ -2154,9 +2175,9 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py class ConditionalDETRLoss(nn.Module): """ - This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process happens in two steps: 1) - we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair - of matched ground-truth / prediction (supervise class and box). + This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process + happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) + we supervise each pair of matched ground-truth / prediction (supervise class and box). @@ -2181,8 +2202,8 @@ def __init__(self, matcher, num_classes, focal_alpha, losses): # removed logging parameter, which was part of the original implementation def loss_labels(self, outputs, targets, indices, num_boxes): """ - Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor of dim - [nb_target_boxes] + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] """ if "logits" not in outputs: raise KeyError("No logits were found in the outputs") @@ -2195,12 +2216,19 @@ def loss_labels(self, outputs, targets, indices, num_boxes): ) target_classes[idx] = target_classes_o - target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2]+1], - dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) + target_classes_onehot = torch.zeros( + [src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1], + dtype=src_logits.dtype, + layout=src_logits.layout, + device=src_logits.device, + ) target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) - target_classes_onehot = target_classes_onehot[:,:,:-1] - loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1] + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * src_logits.shape[1] + ) losses = {"loss_ce": loss_ce} return losses @@ -2430,7 +2458,7 @@ def forward(self, outputs, targets): # Compute the classification cost. 
alpha = 0.25 gamma = 2.0 - neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) class_cost = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] From b1c7c2f92a91843ac273c27df4f04188922adb80 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:14:09 -0400 Subject: [PATCH 005/233] fixed style and copies --- .../conditional_detr/configuration_conditional_detr.py | 8 ++++---- ...itional_detr_original_pytorch_checkpoint_to_pytorch.py | 1 + utils/check_repo.py | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index bfb759ca246ca..844260f0d12f5 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,7 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DeppMeng/ConditionalDETR": "https://huggingface.co/DeppMeng/ConditionalDETR/resolve/main/config.json", + "DepuMeng/ConditionalDETR": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", } @@ -36,7 +36,7 @@ class ConditionalDETRConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DeppMeng/ConditionalDETR](https://huggingface.co/DeppMeng/ConditionalDETR) architecture. + [DepuMeng/ConditionalDETR](https://huggingface.co/DepuMeng/ConditionalDETR) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
@@ -114,10 +114,10 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DeppMeng/ConditionalDETR style configuration + >>> # Initializing a CONDITIONAL_DETR DepuMeng/ConditionalDETR style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DeppMeng/ConditionalDETR style configuration + >>> # Initializing a model from the DepuMeng/ConditionalDETR style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index 4b56d729d1f35..f3c6fcadb2e2d 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -293,6 +293,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): model = ConditionalDETRForSegmentation(config) if is_panoptic else ConditionalDETRForObjectDetection(config) model.load_state_dict(state_dict) model.eval() + model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") # verify our conversion original_outputs = conditional_detr(pixel_values) outputs = model(pixel_values) diff --git a/utils/check_repo.py b/utils/check_repo.py index 207592c91c961..8ee629afdce57 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -58,6 +58,8 @@ "DetrEncoder", # Building part of bigger (tested) model. "DetrDecoder", # Building part of bigger (tested) model. "DetrDecoderWrapper", # Building part of bigger (tested) model. + "ConditionalDETREncoder", # Building part of bigger (tested) model. + "ConditionalDETRDecoder", # Building part of bigger (tested) model. "M2M100Encoder", # Building part of bigger (tested) model. "M2M100Decoder", # Building part of bigger (tested) model. "MCTCTEncoder", # Building part of bigger (tested) model. From af009ecf91556b8647c83f1a697429d27eeb2a75 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:18:56 -0400 Subject: [PATCH 006/233] fixed hub --- .../conditional_detr/configuration_conditional_detr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 844260f0d12f5..3c05d959879ec 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,7 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/ConditionalDETR": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", + "DepuMeng/conditional_detr_resnet50": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", } @@ -36,7 +36,7 @@ class ConditionalDETRConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DepuMeng/ConditionalDETR](https://huggingface.co/DepuMeng/ConditionalDETR) architecture. + [DepuMeng/conditional_detr_resnet50](https://huggingface.co/DepuMeng/conditional_detr_resnet50) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -114,10 +114,10 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DepuMeng/ConditionalDETR style configuration + >>> # Initializing a CONDITIONAL_DETR DepuMeng/conditional_detr_resnet50 style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DepuMeng/ConditionalDETR style configuration + >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration From 3a02a971477dd4cfb877a9aa87b7da6ab487d384 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:27:32 -0400 Subject: [PATCH 007/233] fixed style --- .../models/conditional_detr/configuration_conditional_detr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 3c05d959879ec..4291a6ecc4a94 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,9 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/conditional_detr_resnet50": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", + "DepuMeng/conditional_detr_resnet50": ( + "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json" + ), } From 380cb217a31fed75a9cd1ad8ea7b1a9dba04ddd5 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:08 -0400 Subject: [PATCH 008/233] Update README.md Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4c4096d971165..56747e897f451 100644 --- a/README.md +++ b/README.md @@ -276,7 +276,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From 705ae58f6038306a277dbb576c8cbbe021586351 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:17 -0400 Subject: [PATCH 009/233] Update docs/source/en/_toctree.yml Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 6886b6b662bd0..8b32a281d3575 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -359,7 +359,7 @@ - local: model_doc/beit title: BEiT - local: model_doc/conditional_detr - title: ConditionalDETR + title: Conditional DETR - local: model_doc/convnext title: ConvNeXT - local: model_doc/cvt From bdec20e933dc88e7ad6158e45d39393ebc29504b Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:58 -0400 Subject: [PATCH 010/233] Update docs/source/en/index.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 58901fbc647f3..b4fa500e6fa66 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,7 +68,7 @@ The documentation is organized into five sections: 1. 
**[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From 37a4df72714bd0a16c47afed8e57f1e5e304d6e5 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:34:10 -0400 Subject: [PATCH 011/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 4291a6ecc4a94..722608a7c945e 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 72ca1fdb4e80785004f83699a1de04987fad6ccb Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:34:32 -0400 Subject: [PATCH 012/233] Update src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- ...t_conditional_detr_original_pytorch_checkpoint_to_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index f3c6fcadb2e2d..bcb2bbdda9a7d 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert CONDITIONAL_DETR checkpoints.""" +"""Convert Conditional DETR checkpoints.""" import argparse From 43fc3f262fe1ee5def2b89984cf4f6c9038c9408 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:37:15 -0400 Subject: [PATCH 013/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 722608a7c945e..278ed04c14a35 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" CONDITIONAL_DETR model configuration""" +""" Conditional DETR model configuration""" from collections import OrderedDict from typing import Mapping From ff830499d0fa94264a785ace97ce74e9a6051cbd Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:36 -0400 Subject: [PATCH 014/233] Update docs/source/en/model_doc/conditional_detr.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/model_doc/conditional_detr.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx index 51316a24e43ee..53129b77ea05a 100644 --- a/docs/source/en/model_doc/conditional_detr.mdx +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. ## Overview -The conditional DETR model was proposed in [ConditionalDETR](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 
Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. +The Conditional DETR model was proposed in [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. The abstract from the paper is the following: From 808440a947cb23910ca68fafc77e5e15b5f8618c Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:42 -0400 Subject: [PATCH 015/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/configuration_conditional_detr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 278ed04c14a35..e9485b37e5790 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -36,9 +36,9 @@ class ConditionalDETRConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate - a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DepuMeng/conditional_detr_resnet50](https://huggingface.co/DepuMeng/conditional_detr_resnet50) architecture. + a Conditional DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Conditional DETR + [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
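
The checkpoint and API surface touched by the docstring updates above can be exercised end to end roughly as in the sketch below. This is an illustrative sketch only, not part of any patch in the series: it assumes the `microsoft/conditional-detr-resnet-50` checkpoint named in the configuration docstring is available on the Hub, uses the `ConditionalDETR*` class names as they stand at this point in the series (a later patch renames the class prefix), and assumes `post_process` returns DETR-style dictionaries with `scores`, `labels` and `boxes`.

```python
import torch
import requests
from PIL import Image

from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# checkpoint id taken from the configuration docstring; assumed to be available on the Hub
feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50")
model = ConditionalDETRForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50")

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# rescale the normalized (center_x, center_y, width, height) predictions to absolute corner coordinates
target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
results = feature_extractor.post_process(outputs, target_sizes=target_sizes)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    if score > 0.7:
        print(model.config.id2label[label.item()], box.tolist())
```
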
From 7238020c396cb5cab6e6d359cec3eef818629dfa Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:54 -0400 Subject: [PATCH 016/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index e9485b37e5790..009934ffa4235 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -116,7 +116,7 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DepuMeng/conditional_detr_resnet50 style configuration + >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration >>> configuration = ConditionalDETRConfig() >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration From 34393271b5ee672385b90578221cb588dd38ef63 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:59 -0400 Subject: [PATCH 017/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 009934ffa4235..d84b3f14e02a7 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -119,7 +119,7 @@ class ConditionalDETRConfig(PretrainedConfig): >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration + >>> # Initializing a model from the microsoft/conditional-detr-resnet-50 style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration From ef0fa8955ab9a1d5bfdda97b022b85081a20de82 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 10:48:44 -0400 Subject: [PATCH 018/233] fixed some doc issue --- docs/source/en/_toctree.yml | 6 +++--- src/transformers/models/auto/feature_extraction_auto.py | 2 -- .../conditional_detr/configuration_conditional_detr.py | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 8b32a281d3575..a8f43b73a9b5e 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,8 +42,7 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - isExpanded: false - sections: + - sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +57,8 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides + isExpanded: false title: Natural 
Language Processing - sections: - local: tasks/audio_classification diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 6409674be7096..625b79db06494 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,8 +39,6 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 278ed04c14a35..2d958a0fbe0f5 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,8 +27,8 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/conditional_detr_resnet50": ( - "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json" + "microsoft/conditional_detr_resnet50": ( + "https://huggingface.co/microsoft/conditional_detr_resnet50/resolve/main/config.json" ), } From 1225efa89de69b88a798681c1abb24433d66b1e8 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:19:56 -0400 Subject: [PATCH 019/233] changed prefix to ConditionalDetr --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 3 +- README_zh-hant.md | 3 +- docs/source/en/_toctree.yml | 4 +- docs/source/en/index.mdx | 1 + docs/source/en/model_doc/conditional_detr.mdx | 20 +- src/transformers/__init__.py | 24 +- .../models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 6 +- .../models/conditional_detr/__init__.py | 28 +-- .../configuration_conditional_detr.py | 14 +- ..._original_pytorch_checkpoint_to_pytorch.py | 14 +- .../feature_extraction_conditional_detr.py | 26 +-- .../modeling_conditional_detr.py | 212 +++++++++--------- .../utils/dummy_vision_objects.py | 2 +- ...est_feature_extraction_conditional_detr.py | 16 +- .../test_modeling_conditional_detr.py | 44 ++-- utils/check_repo.py | 4 +- 19 files changed, 215 insertions(+), 210 deletions(-) diff --git a/README.md b/README.md index 56747e897f451..994274fcfef22 100644 --- a/README.md +++ b/README.md @@ -277,6 +277,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. 
**[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_ko.md b/README_ko.md index 56e2b8fcbea33..abc9b412acf72 100644 --- a/README_ko.md +++ b/README_ko.md @@ -228,6 +228,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. 
**[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. diff --git a/README_zh-hans.md b/README_zh-hans.md index 57007f837d261..57de2f0807515 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -252,7 +252,8 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang发布。 +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index b0510ee109e2b..f107db8febbb3 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -264,7 +264,8 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a8f43b73a9b5e..9f05f228765c5 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -57,7 +57,7 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides isExpanded: false title: Natural Language Processing - sections: @@ -497,4 +497,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API + title: API \ No newline at end of file diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index b4fa500e6fa66..5564bbceb093f 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,6 +69,7 @@ The documentation is organized into five sections: 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. 
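To exercise the renamed `ConditionalDetr*` API end to end, a minimal, hypothetical inference sketch (the `Atten4Vis/ConditionalDETR` checkpoint is the one used in the docstring examples further down in this patch; the 0.7 score threshold is arbitrary):

```python
import requests
import torch
from PIL import Image

from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForObjectDetection

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR")
model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR")

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post_process converts the normalized boxes to absolute coordinates on the
# original image; target_sizes expects (height, width) per image
target_sizes = torch.tensor([image.size[::-1]])
results = feature_extractor.post_process(outputs, target_sizes)[0]
keep = results["scores"] > 0.7
print(results["labels"][keep], results["boxes"][keep])
```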
diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx index 53129b77ea05a..d5846cbfee327 100644 --- a/docs/source/en/model_doc/conditional_detr.mdx +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -24,30 +24,30 @@ The abstract from the paper is the following: This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The original code can be found [here](https://github.com/Atten4Vis/ConditionalDETR). -## ConditionalDETRConfig +## ConditionalDetrConfig -[[autodoc]] ConditionalDETRConfig +[[autodoc]] ConditionalDetrConfig -## ConditionalDETRFeatureExtractor +## ConditionalDetrFeatureExtractor -[[autodoc]] ConditionalDETRFeatureExtractor +[[autodoc]] ConditionalDetrFeatureExtractor - __call__ - pad_and_create_pixel_mask - post_process - post_process_segmentation - post_process_panoptic -## ConditionalDETRModel +## ConditionalDetrModel -[[autodoc]] ConditionalDETRModel +[[autodoc]] ConditionalDetrModel - forward -## ConditionalDETRForObjectDetection +## ConditionalDetrForObjectDetection -[[autodoc]] ConditionalDETRForObjectDetection +[[autodoc]] ConditionalDetrForObjectDetection - forward -## ConditionalDETRForSegmentation +## ConditionalDetrForSegmentation -[[autodoc]] ConditionalDETRForSegmentation +[[autodoc]] ConditionalDetrForSegmentation - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 03c6d930ea631..689941f8cbeec 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -171,7 +171,7 @@ "CLIPVisionConfig", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], - "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDETRConfig"], + "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDetrConfig"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.cpm": [], @@ -653,7 +653,7 @@ _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") - _import_structure["models.conditional_detr"].append("ConditionalDETRFeatureExtractor") + _import_structure["models.conditional_detr"].append("ConditionalDetrFeatureExtractor") _import_structure["models.donut"].append("DonutFeatureExtractor") _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaProcessor"]) @@ -697,10 +697,10 @@ _import_structure["models.conditional_detr"].extend( [ "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", - "ConditionalDETRForObjectDetection", - "ConditionalDETRForSegmentation", - "ConditionalDETRModel", - "ConditionalDETRPreTrainedModel", + "ConditionalDetrForObjectDetection", + "ConditionalDetrForSegmentation", + "ConditionalDetrModel", + "ConditionalDetrPreTrainedModel", ] ) @@ -3045,7 +3045,7 @@ CLIPVisionConfig, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer - from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDETRConfig + from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDetrConfig from .models.convbert import 
CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer @@ -3465,7 +3465,7 @@ from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor - from .models.conditional_detr import ConditionalDETRFeatureExtractor + from .models.conditional_detr import ConditionalDetrFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor @@ -3497,10 +3497,10 @@ else: from .models.conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, - ConditionalDETRModel, - ConditionalDETRPreTrainedModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ConditionalDetrPreTrainedModel, ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index dcd730e60836b..971c7d784d3d6 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -43,7 +43,7 @@ ("canine", "CanineConfig"), ("clip", "CLIPConfig"), ("codegen", "CodeGenConfig"), - ("conditional_detr", "ConditionalDETRConfig"), + ("conditional_detr", "ConditionalDetrConfig"), ("convbert", "ConvBertConfig"), ("convnext", "ConvNextConfig"), ("ctrl", "CTRLConfig"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4de5373a0c1c6..3465e4ddbce81 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -42,7 +42,7 @@ ("canine", "CanineModel"), ("clip", "CLIPModel"), ("codegen", "CodeGenModel"), - ("conditional_detr", "ConditionalDETRModel"), + ("conditional_detr", "ConditionalDetrModel"), ("convbert", "ConvBertModel"), ("convnext", "ConvNextModel"), ("ctrl", "CTRLModel"), @@ -366,7 +366,7 @@ [ # Do not add new models here, this class will be deprecated in the future. 
# Model for Image Segmentation mapping - ("conditional_detr", "ConditionalDETRForSegmentation"), + ("conditional_detr", "ConditionalDetrForSegmentation"), ("detr", "DetrForSegmentation"), ] ) @@ -448,7 +448,7 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Object Detection mapping - ("conditional_detr", "ConditionalDETRForObjectDetection"), + ("conditional_detr", "ConditionalDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/conditional_detr/__init__.py b/src/transformers/models/conditional_detr/__init__.py index 1b72e7d7a3238..c2f1bdfdbbaae 100644 --- a/src/transformers/models/conditional_detr/__init__.py +++ b/src/transformers/models/conditional_detr/__init__.py @@ -24,8 +24,8 @@ _import_structure = { "configuration_conditional_detr": [ "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", - "ConditionalDETRConfig", - "ConditionalDETROnnxConfig", + "ConditionalDetrConfig", + "ConditionalDetrOnnxConfig", ] } @@ -35,7 +35,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDETRFeatureExtractor"] + _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDetrFeatureExtractor"] try: if not is_timm_available(): @@ -45,18 +45,18 @@ else: _import_structure["modeling_conditional_detr"] = [ "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", - "ConditionalDETRForObjectDetection", - "ConditionalDETRForSegmentation", - "ConditionalDETRModel", - "ConditionalDETRPreTrainedModel", + "ConditionalDetrForObjectDetection", + "ConditionalDetrForSegmentation", + "ConditionalDetrModel", + "ConditionalDetrPreTrainedModel", ] if TYPE_CHECKING: from .configuration_conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, - ConditionalDETRConfig, - ConditionalDETROnnxConfig, + ConditionalDetrConfig, + ConditionalDetrOnnxConfig, ) try: @@ -65,7 +65,7 @@ except OptionalDependencyNotAvailable: pass else: - from .feature_extraction_conditional_detr import ConditionalDETRFeatureExtractor + from .feature_extraction_conditional_detr import ConditionalDetrFeatureExtractor try: if not is_timm_available(): @@ -75,10 +75,10 @@ else: from .modeling_conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, - ConditionalDETRModel, - ConditionalDETRPreTrainedModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ConditionalDetrPreTrainedModel, ) else: diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 4e0b4674356d9..6abcf85894735 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -33,9 +33,9 @@ } -class ConditionalDETRConfig(PretrainedConfig): +class ConditionalDetrConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate + This is the configuration class to store the configuration of a [`ConditionalDetrModel`]. It is used to instantiate a Conditional DETR model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the Conditional DETR [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) architecture. @@ -48,7 +48,7 @@ class ConditionalDETRConfig(PretrainedConfig): The number of input channels. num_queries (`int`, *optional*, defaults to 100): Number of object queries, i.e. detection slots. This is the maximal number of objects - [`ConditionalDETRModel`] can detect in a single image. For COCO, we recommend 100 queries. + [`ConditionalDetrModel`] can detect in a single image. For COCO, we recommend 100 queries. d_model (`int`, *optional*, defaults to 256): Dimension of the layers. encoder_layers (`int`, *optional*, defaults to 6): @@ -114,13 +114,13 @@ class ConditionalDETRConfig(PretrainedConfig): Examples: ```python - >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig + >>> from transformers import ConditionalDetrModel, ConditionalDetrConfig >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration - >>> configuration = ConditionalDETRConfig() + >>> configuration = ConditionalDetrConfig() >>> # Initializing a model from the microsoft/conditional-detr-resnet-50 style configuration - >>> model = ConditionalDETRModel(configuration) + >>> model = ConditionalDetrModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -214,7 +214,7 @@ def hidden_size(self) -> int: return self.d_model -class ConditionalDETROnnxConfig(OnnxConfig): +class ConditionalDetrOnnxConfig(OnnxConfig): torch_onnx_minimum_version = version.parse("1.11") diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index bcb2bbdda9a7d..904530c44c227 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -26,10 +26,10 @@ import requests from huggingface_hub import hf_hub_download from transformers import ( - ConditionalDETRConfig, - ConditionalDETRFeatureExtractor, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, + ConditionalDetrConfig, + ConditionalDetrFeatureExtractor, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, ) from transformers.utils import logging @@ -226,7 +226,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): """ # load default config - config = ConditionalDETRConfig() + config = ConditionalDetrConfig() # set backbone and dilation attributes if "resnet101" in model_name: config.backbone = "resnet101" @@ -246,7 +246,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): # load feature extractor format = "coco_panoptic" if is_panoptic else "coco_detection" - feature_extractor = ConditionalDETRFeatureExtractor(format=format) + feature_extractor = ConditionalDetrFeatureExtractor(format=format) # prepare image img = prepare_img() @@ -290,7 +290,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): val = state_dict.pop(key) state_dict[prefix + key] = val # finally, create HuggingFace model and load state dict - model = ConditionalDETRForSegmentation(config) if is_panoptic else 
ConditionalDETRForObjectDetection(config) + model = ConditionalDetrForSegmentation(config) if is_panoptic else ConditionalDetrForObjectDetection(config) model.load_state_dict(state_dict) model.eval() model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f37e3e1fb69bf..ec324f1f72456 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -119,7 +119,7 @@ def id_to_rgb(id_map): return color -class ConditionalDETRFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): +class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" Constructs a CONDITIONAL_DETR feature extractor. @@ -431,11 +431,11 @@ def __call__( annotations (`Dict`, `List[Dict]`, *optional*): The corresponding annotations in COCO format. - In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_detection"`, the + In case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_detection"`, the annotations for each image should have the following format: {'image_id': int, 'annotations': [annotation]}, with the annotations being a list of COCO object annotations. - In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the + In case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int, 'file_name': str, 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic annotations. @@ -445,7 +445,7 @@ def __call__( masks_path (`pathlib.Path`, *optional*): Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only - relevant in case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`. + relevant in case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`. pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`): Whether or not to pad images up to the largest image in a batch and create a pixel mask. @@ -676,11 +676,11 @@ def pad_and_create_pixel_mask( # inspired by https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py#L258 def post_process(self, outputs, target_sizes): """ - Converts the output of [`ConditionalDETRForObjectDetection`] into the format expected by the COCO api. Only + Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the COCO api. Only supports PyTorch. Args: - outputs ([`ConditionalDETRObjectDetectionOutput`]): + outputs ([`ConditionalDetrObjectDetectionOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original @@ -717,11 +717,11 @@ def post_process(self, outputs, target_sizes): def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): """ - Converts the output of [`ConditionalDETRForSegmentation`] into image segmentation predictions. Only supports + Converts the output of [`ConditionalDetrForSegmentation`] into image segmentation predictions. 
Only supports PyTorch. Parameters: - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. @@ -760,14 +760,14 @@ def to_tuple(tup): # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): """ - Converts the output of [`ConditionalDETRForSegmentation`] into actual instance segmentation predictions. Only + Converts the output of [`ConditionalDetrForSegmentation`] into actual instance segmentation predictions. Only supports PyTorch. Args: results (`List[Dict]`): - Results list obtained by [`~ConditionalDETRFeatureExtractor.post_process`], to which "masks" results + Results list obtained by [`~ConditionalDetrFeatureExtractor.post_process`], to which "masks" results will be added. - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original @@ -804,11 +804,11 @@ def post_process_instance(self, results, outputs, orig_target_sizes, max_target_ # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): """ - Converts the output of [`ConditionalDETRForSegmentation`] into actual panoptic predictions. Only supports + Converts the output of [`ConditionalDetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch. Parameters: - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 54dea544f3da9..05343a89f3ed0 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -39,7 +39,7 @@ replace_return_docstrings, requires_backends, ) -from .configuration_conditional_detr import ConditionalDETRConfig +from .configuration_conditional_detr import ConditionalDetrConfig if is_scipy_available(): @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = "ConditionalDETRConfig" +_CONFIG_FOR_DOC = "ConditionalDetrConfig" _CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -63,7 +63,7 @@ @dataclass -class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): +class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions): """ Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. 
the output @@ -95,7 +95,7 @@ class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): @dataclass -class ConditionalDETRModelOutput(Seq2SeqModelOutput): +class ConditionalDetrModelOutput(Seq2SeqModelOutput): """ Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder @@ -137,10 +137,10 @@ class ConditionalDETRModelOutput(Seq2SeqModelOutput): @dataclass -# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDETR -class ConditionalDETRObjectDetectionOutput(ModelOutput): +# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDetr +class ConditionalDetrObjectDetectionOutput(ModelOutput): """ - Output type of [`ConditionalDETRForObjectDetection`]. + Output type of [`ConditionalDetrForObjectDetection`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): @@ -154,7 +154,7 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + possible padding). You can use [`~ConditionalDetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) @@ -201,10 +201,10 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): @dataclass -# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDETR -class ConditionalDETRSegmentationOutput(ModelOutput): +# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDetr +class ConditionalDetrSegmentationOutput(ModelOutput): """ - Output type of [`ConditionalDETRForSegmentation`]. + Output type of [`ConditionalDetrForSegmentation`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): @@ -218,12 +218,12 @@ class ConditionalDETRSegmentationOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + possible padding). You can use [`~ConditionalDetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding boxes. pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): Segmentation masks logits for all queries. See also - [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or - [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation + [`~ConditionalDetrFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDetrFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks respectively. 
auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) @@ -272,8 +272,8 @@ class ConditionalDETRSegmentationOutput(ModelOutput): # BELOW: utilities copied from # https://github.com/facebookresearch/detr/blob/master/backbone.py -# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDETR -class ConditionalDETRFrozenBatchNorm2d(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDetr +class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ BatchNorm2d where the batch statistics and the affine parameters are fixed. @@ -282,7 +282,7 @@ class ConditionalDETRFrozenBatchNorm2d(nn.Module): """ def __init__(self, n): - super(ConditionalDETRFrozenBatchNorm2d, self).__init__() + super(ConditionalDetrFrozenBatchNorm2d, self).__init__() self.register_buffer("weight", torch.ones(n)) self.register_buffer("bias", torch.zeros(n)) self.register_buffer("running_mean", torch.zeros(n)) @@ -295,7 +295,7 @@ def _load_from_state_dict( if num_batches_tracked_key in state_dict: del state_dict[num_batches_tracked_key] - super(ConditionalDETRFrozenBatchNorm2d, self)._load_from_state_dict( + super(ConditionalDetrFrozenBatchNorm2d, self)._load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) @@ -312,12 +312,12 @@ def forward(self, x): return x * scale + bias -# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDetr def replace_batch_norm(m, name=""): for attr_str in dir(m): target_attr = getattr(m, attr_str) if isinstance(target_attr, nn.BatchNorm2d): - frozen = ConditionalDETRFrozenBatchNorm2d(target_attr.num_features) + frozen = ConditionalDetrFrozenBatchNorm2d(target_attr.num_features) bn = getattr(m, attr_str) frozen.weight.data.copy_(bn.weight) frozen.bias.data.copy_(bn.bias) @@ -328,11 +328,11 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) -class ConditionalDETRTimmConvEncoder(nn.Module): +class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. - nn.BatchNorm2d layers are replaced by ConditionalDETRFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. """ @@ -369,8 +369,8 @@ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): return out -# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDETR -class ConditionalDETRConvModel(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDetr +class ConditionalDetrConvModel(nn.Module): """ This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. """ @@ -391,7 +391,7 @@ def forward(self, pixel_values, pixel_mask): return out, pos -# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDetr def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
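The `_expand_mask` helper whose docstring appears just above broadcasts a `[batch_size, seq_len]` padding mask to the additive `[batch_size, 1, tgt_len, src_len]` form consumed by the attention layers. A small torch sketch of that behaviour (shapes are made up):

```python
import torch

# 1 = real position, 0 = padding
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
bsz, src_len = mask.shape
tgt_len = src_len

expanded = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(torch.float32)
inverted = 1.0 - expanded
# Padded positions get a very large negative bias so the softmax ignores them
bias = inverted.masked_fill(inverted.bool(), torch.finfo(torch.float32).min)
print(bias.shape)  # torch.Size([2, 1, 4, 4])
```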
@@ -406,8 +406,8 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDETR -class ConditionalDETRSinePositionEmbedding(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDetr +class ConditionalDetrSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. @@ -444,8 +444,8 @@ def forward(self, pixel_values, pixel_mask): return pos -# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDETR -class ConditionalDETRLearnedPositionEmbedding(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDetr +class ConditionalDetrLearnedPositionEmbedding(nn.Module): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -468,14 +468,14 @@ def forward(self, pixel_values, pixel_mask=None): return pos -# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDetr def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = ConditionalDETRSinePositionEmbedding(n_steps, normalize=True) + position_embedding = ConditionalDetrSinePositionEmbedding(n_steps, normalize=True) elif config.position_embedding_type == "learned": - position_embedding = ConditionalDETRLearnedPositionEmbedding(n_steps) + position_embedding = ConditionalDetrLearnedPositionEmbedding(n_steps) else: raise ValueError(f"Not supported {config.position_embedding_type}") @@ -634,7 +634,7 @@ def forward( return attn_output, attn_weights_reshaped -class ConditionalDETRAttention(nn.Module): +class ConditionalDetrAttention(nn.Module): """ Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. 
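The cross-attention renamed just above is the heart of Conditional DETR: decoder queries and keys are concatenations of a content embedding and a spatial embedding (which is why the decoder layer further down constructs it with `self.embed_dim * 2`), so each attention score separates into a content term plus a spatial term. A toy illustration of that identity (tensors and sizes are made up):

```python
import torch

d = 256
content_q, spatial_q = torch.randn(1, 10, d), torch.randn(1, 10, d)
content_k, spatial_k = torch.randn(1, 100, d), torch.randn(1, 100, d)

q = torch.cat([content_q, spatial_q], dim=-1)  # [1, 10, 2*d]
k = torch.cat([content_k, spatial_k], dim=-1)  # [1, 100, 2*d]

# Dot product over the concatenated dimension == content dot product + spatial dot product
scores = q @ k.transpose(-1, -2)
decomposed = content_q @ content_k.transpose(-1, -2) + spatial_q @ spatial_k.transpose(-1, -2)
print(torch.allclose(scores, decomposed))  # True
```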
@@ -750,8 +750,8 @@ def forward( return attn_output, attn_weights_reshaped -class ConditionalDETREncoderLayer(nn.Module): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrEncoderLayer(nn.Module): + def __init__(self, config: ConditionalDetrConfig): super().__init__() self.embed_dim = config.d_model self.self_attn = DetrAttention( @@ -819,8 +819,8 @@ def forward( return outputs -class ConditionalDETRDecoderLayer(nn.Module): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrDecoderLayer(nn.Module): + def __init__(self, config: ConditionalDetrConfig): super().__init__() self.embed_dim = config.d_model @@ -832,7 +832,7 @@ def __init__(self, config: ConditionalDETRConfig): self.sa_kpos_proj = nn.Linear(d_model, d_model) self.sa_v_proj = nn.Linear(d_model, d_model) - self.self_attn = ConditionalDETRAttention( + self.self_attn = ConditionalDetrAttention( embed_dim=self.embed_dim, out_dim=self.embed_dim, num_heads=config.decoder_attention_heads, @@ -852,7 +852,7 @@ def __init__(self, config: ConditionalDETRConfig): self.ca_v_proj = nn.Linear(d_model, d_model) self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) - self.encoder_attn = ConditionalDETRAttention( + self.encoder_attn = ConditionalDetrAttention( self.embed_dim * 2, self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -989,8 +989,8 @@ def forward( return outputs -# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDETR -class ConditionalDETRClassificationHead(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDetr +class ConditionalDetrClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): @@ -1023,9 +1023,9 @@ def forward(self, x): return x -# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDETR -class ConditionalDETRPreTrainedModel(PreTrainedModel): - config_class = ConditionalDETRConfig +# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDetr +class ConditionalDetrPreTrainedModel(PreTrainedModel): + config_class = ConditionalDetrConfig base_model_prefix = "model" main_input_name = "pixel_values" @@ -1033,12 +1033,12 @@ def _init_weights(self, module): std = self.config.init_std xavier_std = self.config.init_xavier_std - if isinstance(module, ConditionalDETRMHAttentionMap): + if isinstance(module, ConditionalDetrMHAttentionMap): nn.init.zeros_(module.k_linear.bias) nn.init.zeros_(module.q_linear.bias) nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) - elif isinstance(module, ConditionalDETRLearnedPositionEmbedding): + elif isinstance(module, ConditionalDetrLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): @@ -1053,7 +1053,7 @@ def _init_weights(self, module): module.weight.data[module.padding_idx].zero_() def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, ConditionalDETRDecoder): + if isinstance(module, ConditionalDetrDecoder): module.gradient_checkpointing = value @@ -1067,7 +1067,7 @@ def _set_gradient_checkpointing(self, module, 
value=False): and behavior. Parameters: - config ([`ConditionalDETRConfig`]): + config ([`ConditionalDetrConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. @@ -1078,8 +1078,8 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See - [`ConditionalDETRFeatureExtractor.__call__`] for details. + Pixel values can be obtained using [`ConditionalDetrFeatureExtractor`]. See + [`ConditionalDetrFeatureExtractor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: @@ -1112,11 +1112,11 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR -class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDetr +class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a - [`ConditionalDETREncoderLayer`]. + [`ConditionalDetrEncoderLayer`]. The encoder updates the flattened feature map through multiple self-attention layers. @@ -1125,16 +1125,16 @@ class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): - position_embeddings are added to the forward pass. Args: - config: ConditionalDETRConfig + config: ConditionalDetrConfig """ - def __init__(self, config: ConditionalDETRConfig): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.encoder_layerdrop - self.layers = nn.ModuleList([ConditionalDETREncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default @@ -1222,9 +1222,9 @@ def forward( ) -class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): +class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel): """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDETRDecoderLayer`]. + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDetrDecoderLayer`]. The decoder updates the query embeddings through multiple self-attention and cross-attention layers. @@ -1234,15 +1234,15 @@ class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. 
Args: - config: ConditionalDETRConfig + config: ConditionalDetrConfig """ - def __init__(self, config: ConditionalDETRConfig): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.decoder_layerdrop - self.layers = nn.ModuleList([ConditionalDETRDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output self.layernorm = nn.LayerNorm(config.d_model) d_model = config.d_model @@ -1425,7 +1425,7 @@ def custom_forward(*inputs): ] if v is not None ) - return ConditionalDETRDecoderOutput( + return ConditionalDetrDecoderOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attns, @@ -1442,22 +1442,22 @@ def custom_forward(*inputs): """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRModel(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrModel(ConditionalDetrPreTrainedModel): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # Create backbone + positional encoding - backbone = ConditionalDETRTimmConvEncoder(config.backbone, config.dilation) + backbone = ConditionalDetrTimmConvEncoder(config.backbone, config.dilation) position_embeddings = build_position_encoding(config) - self.backbone = ConditionalDETRConvModel(backbone, position_embeddings) + self.backbone = ConditionalDetrConvModel(backbone, position_embeddings) # Create projection layer self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) - self.encoder = ConditionalDETREncoder(config) - self.decoder = ConditionalDETRDecoder(config) + self.encoder = ConditionalDetrEncoder(config) + self.decoder = ConditionalDetrDecoder(config) # Initialize weights and apply final processing self.post_init() @@ -1477,7 +1477,7 @@ def unfreeze_backbone(self): param.requires_grad_(True) @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRModelOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1496,15 +1496,15 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRModel + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrModel >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state @@ -1582,7 +1582,7 @@ def forward( if not return_dict: return decoder_outputs + encoder_outputs - return ConditionalDETRModelOutput( + return 
ConditionalDetrModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, @@ -1602,18 +1602,18 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # CONDITIONAL_DETR encoder-decoder model - self.model = ConditionalDETRModel(config) + self.model = ConditionalDetrModel(config) # Object detection heads self.class_labels_classifier = nn.Linear( config.d_model, config.num_labels ) # We add one for the "no object" class - self.bbox_predictor = ConditionalDETRMLPPredictionHead( + self.bbox_predictor = ConditionalDetrMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) @@ -1629,7 +1629,7 @@ def _set_aux_loss(self, outputs_class, outputs_coord): return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1655,15 +1655,15 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForObjectDetection >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) @@ -1703,12 +1703,12 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: # First: create the matcher - matcher = ConditionalDETRHungarianMatcher( + matcher = ConditionalDetrHungarianMatcher( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion losses = ["labels", "boxes", "cardinality"] - criterion = ConditionalDETRLoss( + criterion = ConditionalDetrLoss( matcher=matcher, num_classes=self.config.num_labels, focal_alpha=self.config.focal_alpha, @@ -1751,7 +1751,7 @@ def forward( output = (logits, pred_boxes) + outputs return ((loss, loss_dict) + output) if loss is not None else output - return ConditionalDETRObjectDetectionOutput( + return ConditionalDetrObjectDetectionOutput( loss=loss, loss_dict=loss_dict, logits=logits, @@ -1775,22 +1775,22 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel): + def 
__init__(self, config: ConditionalDetrConfig): super().__init__(config) # object detection model - self.conditional_detr = ConditionalDETRForObjectDetection(config) + self.conditional_detr = ConditionalDetrForObjectDetection(config) # segmentation head hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads intermediate_channel_sizes = self.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes - self.mask_head = ConditionalDETRMaskHeadSmallConv( + self.mask_head = ConditionalDetrMaskHeadSmallConv( hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size ) - self.bbox_attention = ConditionalDETRMHAttentionMap( + self.bbox_attention = ConditionalDetrMHAttentionMap( hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std ) @@ -1798,7 +1798,7 @@ def __init__(self, config: ConditionalDETRConfig): self.post_init() @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRSegmentationOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrSegmentationOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1832,16 +1832,16 @@ def forward( >>> import torch >>> import numpy - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForSegmentation >>> from transformers.models.conditional_detr.feature_extraction_conditional_detr import rgb_to_id >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained( + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained( ... "facebook/conditional_detr-resnet-50-panoptic" ... 
) - >>> model = ConditionalDETRForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") + >>> model = ConditionalDetrForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") @@ -1849,7 +1849,7 @@ def forward( >>> # forward pass >>> outputs = model(**inputs) - >>> # use the `post_process_panoptic` method of `ConditionalDETRFeatureExtractor` to convert to COCO format + >>> # use the `post_process_panoptic` method of `ConditionalDetrFeatureExtractor` to convert to COCO format >>> processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0) >>> result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0] @@ -1947,12 +1947,12 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: # First: create the matcher - matcher = ConditionalDETRHungarianMatcher( + matcher = ConditionalDetrHungarianMatcher( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion losses = ["labels", "boxes", "cardinality", "masks"] - criterion = ConditionalDETRLoss( + criterion = ConditionalDetrLoss( matcher=matcher, num_classes=self.config.num_labels, focal_alpha=self.config.focal_alpha, @@ -1991,7 +1991,7 @@ def forward( output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs return ((loss, loss_dict) + output) if loss is not None else output - return ConditionalDETRSegmentationOutput( + return ConditionalDetrSegmentationOutput( loss=loss, loss_dict=loss_dict, logits=logits, @@ -2013,8 +2013,8 @@ def _expand(tensor, length: int): # taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py -# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDETR -class ConditionalDETRMaskHeadSmallConv(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDetr +class ConditionalDetrMaskHeadSmallConv(nn.Module): """ Simple convolutional head, using group norm. Upsampling is done using a FPN approach """ @@ -2094,8 +2094,8 @@ def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): return x -# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDETR -class ConditionalDETRMHAttentionMap(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDetr +class ConditionalDetrMHAttentionMap(nn.Module): """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): @@ -2173,16 +2173,16 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py -class ConditionalDETRLoss(nn.Module): +class ConditionalDetrLoss(nn.Module): """ - This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process + This class computes the losses for ConditionalDetrForObjectDetection/ConditionalDetrForSegmentation. The process happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box). 
Args: - matcher (`ConditionalDETRHungarianMatcher`): + matcher (`ConditionalDetrHungarianMatcher`): Module able to compute a matching between targets and proposals. num_classes (`int`): Number of object categories, omitting the special no-object category. @@ -2375,7 +2375,7 @@ def forward(self, outputs, targets): # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -class ConditionalDETRMLPPredictionHead(nn.Module): +class ConditionalDetrMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, height and width of a bounding box w.r.t. an image. @@ -2397,7 +2397,7 @@ def forward(self, x): # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py -class ConditionalDETRHungarianMatcher(nn.Module): +class ConditionalDetrHungarianMatcher(nn.Module): """ This class computes an assignment between the targets and the predictions of the network. diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index f61b8dfdda282..d2ec5be33ceb8 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -24,7 +24,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class ConditionalDETRFeatureExtractor(metaclass=DummyObject): +class ConditionalDetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index b5e19fe005da7..2d21cafc68520 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -32,10 +32,10 @@ if is_vision_available(): from PIL import Image - from transformers import ConditionalDETRFeatureExtractor + from transformers import ConditionalDetrFeatureExtractor -class ConditionalDETRFeatureExtractionTester(unittest.TestCase): +class ConditionalDetrFeatureExtractionTester(unittest.TestCase): def __init__( self, parent, @@ -74,7 +74,7 @@ def prepare_feat_extract_dict(self): def get_expected_values(self, image_inputs, batched=False): """ - This function computes the expected height and width when providing images to ConditionalDETRFeatureExtractor, + This function computes the expected height and width when providing images to ConditionalDetrFeatureExtractor, assuming do_resize is set to True with a scalar size. 
""" if not batched: @@ -106,12 +106,12 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision -class ConditionalDETRFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): +class ConditionalDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): - feature_extraction_class = ConditionalDETRFeatureExtractor if is_vision_available() else None + feature_extraction_class = ConditionalDetrFeatureExtractor if is_vision_available() else None def setUp(self): - self.feature_extract_tester = ConditionalDETRFeatureExtractionTester(self) + self.feature_extract_tester = ConditionalDetrFeatureExtractionTester(self) @property def feat_extract_dict(self): @@ -253,7 +253,7 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} # encode them - feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") # verify pixel values @@ -300,7 +300,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): # encode them # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic - feature_extractor = ConditionalDETRFeatureExtractor(format="coco_panoptic") + feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic") encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") # verify pixel values diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 38933f26c6627..fd008b9af494f 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -19,7 +19,7 @@ import math import unittest -from transformers import ConditionalDETRConfig, is_timm_available, is_vision_available +from transformers import ConditionalDetrConfig, is_timm_available, is_vision_available from transformers.testing_utils import require_timm, require_vision, slow, torch_device from transformers.utils import cached_property @@ -31,16 +31,16 @@ if is_timm_available(): import torch - from transformers import ConditionalDETRForObjectDetection, ConditionalDETRForSegmentation, ConditionalDETRModel + from transformers import ConditionalDetrForObjectDetection, ConditionalDetrForSegmentation, ConditionalDetrModel if is_vision_available(): from PIL import Image - from transformers import ConditionalDETRFeatureExtractor + from transformers import ConditionalDetrFeatureExtractor -class ConditionalDETRModelTester: +class ConditionalDetrModelTester: def __init__( self, parent, @@ -105,7 +105,7 @@ def prepare_config_and_inputs(self): return config, pixel_values, pixel_mask, labels def get_config(self): - return ConditionalDETRConfig( + return ConditionalDetrConfig( d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers, @@ -125,7 +125,7 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict def create_and_check_conditional_detr_model(self, config, pixel_values, pixel_mask, labels): - model = ConditionalDETRModel(config=config) + model = ConditionalDetrModel(config=config) model.to(torch_device) model.eval() @@ -137,7 +137,7 @@ def create_and_check_conditional_detr_model(self, config, 
pixel_values, pixel_ma ) def create_and_check_conditional_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): - model = ConditionalDETRForObjectDetection(config=config) + model = ConditionalDetrForObjectDetection(config=config) model.to(torch_device) model.eval() @@ -155,12 +155,12 @@ def create_and_check_conditional_detr_object_detection_head_model(self, config, @require_timm -class ConditionalDETRModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( ( - ConditionalDETRModel, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, + ConditionalDetrModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, ) if is_timm_available() else () @@ -176,7 +176,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class.__name__ in ["ConditionalDETRForObjectDetection", "ConditionalDETRForSegmentation"]: + if model_class.__name__ in ["ConditionalDetrForObjectDetection", "ConditionalDetrForSegmentation"]: labels = [] for i in range(self.model_tester.batch_size): target = {} @@ -199,8 +199,8 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict def setUp(self): - self.model_tester = ConditionalDETRModelTester(self) - self.config_tester = ConfigTester(self, config_class=ConditionalDETRConfig, has_text_modality=False) + self.model_tester = ConditionalDetrModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConditionalDetrConfig, has_text_modality=False) def test_config(self): self.config_tester.run_common_tests() @@ -279,10 +279,10 @@ def test_attention_outputs(self): if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning # Object Detection model returns pred_logits and pred_boxes - if model_class.__name__ == "ConditionalDETRForObjectDetection": + if model_class.__name__ == "ConditionalDetrForObjectDetection": correct_outlen += 1 # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks - if model_class.__name__ == "ConditionalDETRForSegmentation": + if model_class.__name__ == "ConditionalDetrForSegmentation": correct_outlen += 2 if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned @@ -406,7 +406,7 @@ def test_different_timm_backbone(self): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - if model_class.__name__ == "ConditionalDETRForObjectDetection": + if model_class.__name__ == "ConditionalDetrForObjectDetection": expected_shape = ( self.model_tester.batch_size, self.model_tester.num_queries, @@ -452,17 +452,17 @@ def prepare_img(): @require_timm @require_vision @slow -class ConditionalDETRModelIntegrationTests(unittest.TestCase): +class ConditionalDetrModelIntegrationTests(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") if is_vision_available() else None ) def test_inference_no_head(self): - model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) 
feature_extractor = self.default_feature_extractor image = prepare_img() @@ -479,7 +479,7 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +505,7 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() diff --git a/utils/check_repo.py b/utils/check_repo.py index 8ee629afdce57..0d86129923b81 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -58,8 +58,8 @@ "DetrEncoder", # Building part of bigger (tested) model. "DetrDecoder", # Building part of bigger (tested) model. "DetrDecoderWrapper", # Building part of bigger (tested) model. - "ConditionalDETREncoder", # Building part of bigger (tested) model. - "ConditionalDETRDecoder", # Building part of bigger (tested) model. + "ConditionalDetrEncoder", # Building part of bigger (tested) model. + "ConditionalDetrDecoder", # Building part of bigger (tested) model. "M2M100Encoder", # Building part of bigger (tested) model. "M2M100Decoder", # Building part of bigger (tested) model. "MCTCTEncoder", # Building part of bigger (tested) model. From a303acf5bc318919bde10f7360e529f9f58b8761 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:23:00 -0400 Subject: [PATCH 020/233] fixed docs --- README.md | 1 - README_zh-hans.md | 1 - README_zh-hant.md | 1 - docs/source/en/index.mdx | 3 +-- docs/source/en/serialization.mdx | 2 +- 5 files changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 994274fcfef22..56747e897f451 100644 --- a/README.md +++ b/README.md @@ -277,7 +277,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. 
**[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_zh-hans.md b/README_zh-hans.md index 57de2f0807515..7d51224255cbb 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -253,7 +253,6 @@ conda install -c huggingface transformers 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index f107db8febbb3..63c180a5a68da 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -265,7 +265,6 @@ conda install -c huggingface transformers 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 5564bbceb093f..29e92f030984e 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,7 +69,6 @@ The documentation is organized into five sections: 1. 
**[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. @@ -214,7 +213,7 @@ Flax), PyTorch, and/or TensorFlow. | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | -| conditional_detr | ❌ | ❌ | ✅ | ❌ | ❌ | +| Conditional DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index d7ff03e94d116..f484a11e04292 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -57,7 +57,7 @@ Ready-made configurations include the following architectures: - CamemBERT - CLIP - CodeGen -- conditional_detr +- Conditional DETR - ConvBERT - ConvNeXT - Data2VecText From 437ed9adecff92122a6b79899cb1f35efeaab0f7 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 21:50:18 -0400 Subject: [PATCH 021/233] Update README_ko.md --- README_ko.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README_ko.md b/README_ko.md index abc9b412acf72..68ca244f35c9d 100644 --- a/README_ko.md +++ b/README_ko.md @@ -229,7 +229,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From 519e49546a33ffd345facb90a75f64cfe15f6a43 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:20:11 -0400 Subject: [PATCH 022/233] added spatial_model_name --- utils/check_copies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check_copies.py b/utils/check_copies.py index 7d57173654468..0c8afa7b77ae8 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,6 +477,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", + "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. 
This only concerns modular models or those who do From a054c0d6466868a5378a5fe1835470fd628ad99c Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:24:51 -0400 Subject: [PATCH 023/233] fixed fix-copies --- src/transformers/models/auto/configuration_auto.py | 2 +- utils/check_copies.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 971c7d784d3d6..4bbf365fb2d0b 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -296,7 +296,7 @@ ("canine", "CANINE"), ("clip", "CLIP"), ("codegen", "CodeGen"), - ("conditional_detr", "conditional_detr"), + ("conditional_detr", "Conditional DETR"), ("convbert", "ConvBERT"), ("convnext", "ConvNeXT"), ("cpm", "CPM"), diff --git a/utils/check_copies.py b/utils/check_copies.py index 0c8afa7b77ae8..7d57173654468 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,7 +477,6 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", - "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. This only concerns modular models or those who do From e93117c37345bbfe9275661ee00d29dbfda2c315 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:44:17 -0400 Subject: [PATCH 024/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index ec324f1f72456..f072310f7f878 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Feature extractor class for CONDITIONAL_DETR.""" +"""Feature extractor class for Conditional DETR.""" import io import pathlib From f9ed25168378de102f5f2539136d1a825fba2ae2 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:44:58 -0400 Subject: [PATCH 025/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f072310f7f878..bb000a7996b3c 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -37,7 +37,7 @@ ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] -# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +# Copied from transformers.models.detr.feature_extraction_detr.center_to_corners_format def center_to_corners_format(x): """ Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format From 349033860223c7d0c87ccfa02017918011f80c8d Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:45:19 -0400 Subject: [PATCH 026/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index bb000a7996b3c..d89f3154e0bf0 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -48,6 +48,7 @@ def center_to_corners_format(x): return torch.stack(b, dim=-1) +# Copied from transformers.models.detr.feature_extraction_detr.corners_to_center_format def corners_to_center_format(x): """ Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1, From ec27a1895c538da976de4582b96e0b8ab90c159b Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:45:43 -0400 Subject: [PATCH 027/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index d89f3154e0bf0..d813ab2a45e3c 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -60,6 +60,7 @@ def corners_to_center_format(x): return np.stack(b, axis=-1) +# Copied from transformers.models.detr.feature_extraction_detr.masks_to_boxes def masks_to_boxes(masks): """ Compute the 
bounding boxes around the provided panoptic segmentation masks. From 4e8d871cbdb70af178d9372462e316f75f9ccf87 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:46:05 -0400 Subject: [PATCH 028/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 05343a89f3ed0..3bf47082f6b30 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Attn4Vis and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 Microsoft Research Asia and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From d2f4fb02f427c4fd3f8bd526823d6ad0f85d93a9 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:46:25 -0400 Subject: [PATCH 029/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 3bf47082f6b30..7d45259b60adc 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -57,7 +57,7 @@ _CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Atten4Vis/ConditionalDETR", + "microsoft/conditional-detr-resnet-50", # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr ] From 37d4ed17260c97a9e6921133c78b75a629f916d2 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:51:20 -0400 Subject: [PATCH 030/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 7d45259b60adc..bf4d82479375e 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -270,8 +270,6 @@ class ConditionalDetrSegmentationOutput(ModelOutput): encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None -# BELOW: utilities copied from -# https://github.com/facebookresearch/detr/blob/master/backbone.py # Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDetr class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ From dcfe174f660a8bd7d0d70189aee345c9336552e9 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:52:46 -0400 Subject: [PATCH 031/233] Update 
src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index bf4d82479375e..8550bba968802 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -326,6 +326,7 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) +# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. From 16605bb69cab160406769f8ef8f3fa0d2a2ed9e0 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:54:05 -0400 Subject: [PATCH 032/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 8550bba968802..415f0d47a41f5 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -503,6 +503,7 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) +# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. From 63cb53973c63f6950d6b5221ad95eac6a54df609 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:56:48 -0400 Subject: [PATCH 033/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 415f0d47a41f5..be07eca130e23 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1112,7 +1112,7 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDetr +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL DETR,Detr->ConditionalDetr class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. 
Each layer is a From 4426abff37eb7ff9af66ea16226831362ffea77f Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:04 -0400 Subject: [PATCH 034/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index be07eca130e23..00faac56fa1de 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1228,7 +1228,7 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel): The decoder updates the query embeddings through multiple self-attention and cross-attention layers. - Some small tweaks for CONDITIONAL_DETR: + Some small tweaks for Conditional DETR: - position_embeddings and query_position_embeddings are added to the forward pass. - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. From 4487f2be8d68177dbc308fd2ccb359e3e38bd862 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:19 -0400 Subject: [PATCH 035/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 00faac56fa1de..849502de9323a 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1243,7 +1243,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layerdrop = config.decoder_layerdrop self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) - # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output + # in Conditional DETR, the decoder uses layernorm after the last decoder layer output self.layernorm = nn.LayerNorm(config.d_model) d_model = config.d_model self.gradient_checkpointing = False From 0e7048903c242448f011fa42706298dacf7c078a Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:42 -0400 Subject: [PATCH 036/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 849502de9323a..a063c1c2e46e4 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1437,7 +1437,7 @@ def custom_forward(*inputs): @add_start_docstrings( """ - The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + The bare Conditional DETR Model (consisting of a backbone and encoder-decoder Transformer) 
outputting raw hidden-states without any specific head on top. """, CONDITIONAL_DETR_START_DOCSTRING, From 01b20c9957d293b972fe35b540003bc367fa54f1 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:26:13 -0400 Subject: [PATCH 037/233] added some copied from --- .../feature_extraction_conditional_detr.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index d813ab2a45e3c..1421e82aa789a 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -94,9 +94,7 @@ def masks_to_boxes(masks): return np.stack([x_min, y_min, x_max, y_max], 1) -# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py -# Copyright (c) 2018, Alexander Kirillov -# All rights reserved. +# Copied from transformers.models.detr.feature_extraction_detr.rgb_to_id def rgb_to_id(color): if isinstance(color, np.ndarray) and len(color.shape) == 3: if color.dtype == np.uint8: @@ -104,7 +102,7 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) - +# Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): id_map_copy = id_map.copy() @@ -123,7 +121,7 @@ def id_to_rgb(id_map): class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" - Constructs a CONDITIONAL_DETR feature extractor. + Constructs a CONDITIONAL DETR feature extractor. This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. @@ -188,7 +186,7 @@ def prepare(self, image, target, return_segmentation_masks=False, masks_path=Non else: raise ValueError(f"Format {self.format} not supported") - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, segmentations, height, width): try: @@ -212,10 +210,10 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->Conditional DETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by CONDITIONAL_DETR. + Convert the target in COCO format into the format expected by CONDITIONAL DETR. 
""" w, h = image.size @@ -277,6 +275,7 @@ def prepare_coco_detection(self, image, target, return_segmentation_masks=False) return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_panoptic def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): w, h = image.size ann_info = target.copy() @@ -310,6 +309,7 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,6 +380,7 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. @@ -401,7 +402,8 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -549,7 +551,7 @@ def __call__( if annotations is not None: annotations = [annotations] - # prepare (COCO annotations as a list of Dict -> CONDITIONAL_DETR target as a single Dict per image) + # prepare (COCO annotations as a list of Dict -> CONDITIONAL DETR target as a single Dict per image) if annotations is not None: for idx, (image, target) in enumerate(zip(images, annotations)): if not isinstance(image, Image.Image): @@ -624,6 +626,7 @@ def __call__( return encoded_inputs + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] @@ -632,6 +635,7 @@ def _max_by_axis(self, the_list): maxes[index] = max(maxes[index], item) return maxes + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.pad_and_create_pixel_mask def pad_and_create_pixel_mask( self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None ): @@ -717,6 +721,7 @@ def post_process(self, outputs, target_sizes): return results + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_segmentation with Detr->ConditionalDetr def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): """ Converts the output of [`ConditionalDetrForSegmentation`] into image segmentation predictions. Only supports @@ -759,7 +764,7 @@ def to_tuple(tup): preds.append(predictions) return preds - # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_instance with Detr->ConditionalDetr def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): """ Converts the output of [`ConditionalDetrForSegmentation`] into actual instance segmentation predictions. 
Only @@ -803,7 +808,7 @@ def post_process_instance(self, results, outputs, orig_target_sizes, max_target_ return results - # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_panoptic with Detr->ConditionalDetr def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): """ Converts the output of [`ConditionalDetrForSegmentation`] into actual panoptic predictions. Only supports From 801f3c9fbf93f2c05bfb9a893d20c839f5f1ba0e Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:27:39 -0400 Subject: [PATCH 038/233] added some copied from --- .../conditional_detr/feature_extraction_conditional_detr.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 1421e82aa789a..812a0fe103cea 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -309,7 +309,6 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,7 +379,6 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. 
@@ -403,7 +401,6 @@ def _normalize(self, image, mean, std, target=None): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -626,7 +623,6 @@ def __call__( return encoded_inputs - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] From 203d1df042f0921ebb203282439b99a50c48caf1 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:40:45 -0400 Subject: [PATCH 039/233] added some copied from --- .../feature_extraction_conditional_detr.py | 3 +- .../modeling_conditional_detr.py | 52 +++++++++++-------- ...est_feature_extraction_conditional_detr.py | 4 +- .../test_modeling_conditional_detr.py | 8 +-- 4 files changed, 36 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 812a0fe103cea..c9f5d8124e539 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -210,10 +210,9 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->Conditional DETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by CONDITIONAL DETR. + Convert the target in COCO format into the format expected by Conditional DETR. """ w, h = image.size diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a063c1c2e46e4..435a80b220b3b 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -54,11 +54,11 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "ConditionalDetrConfig" -_CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" +_CHECKPOINT_FOR_DOC = "microsoft/conditional-detr-resnet-50" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ "microsoft/conditional-detr-resnet-50", - # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr + # See all Conditional DETR models at https://huggingface.co/models?filter=conditional_detr ] @@ -331,11 +331,11 @@ class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. - nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. 
""" - def __init__(self, name: str, dilation: bool): + def __init__(self, name: str, dilation: bool, use_pretrained_backbone: bool, num_channels: int = 3): super().__init__() kwargs = {} @@ -344,7 +344,14 @@ def __init__(self, name: str, dilation: bool): requires_backends(self, ["timm"]) - backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs) + backbone = create_model( + name, + pretrained=use_pretrained_backbone, + features_only=True, + out_indices=(1, 2, 3, 4), + in_chans=num_channels, + **kwargs, + ) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) @@ -482,7 +489,7 @@ def build_position_encoding(config): # function to generate sine positional embedding for 2d coordinates -def gen_sineembed_for_position(pos_tensor): +def gen_sine_position_embeddings(pos_tensor): scale = 2 * math.pi dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) dim_t = 10000 ** (2 * (dim_t // 2) / 128) @@ -503,12 +510,11 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) -# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the CONDITIONAL_DETR paper). + Here, we add position embeddings to the queries and keys (as explained in the Conditional DETR paper). """ def __init__( @@ -1112,7 +1118,6 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL DETR,Detr->ConditionalDetr class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -1120,7 +1125,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): The encoder updates the flattened feature map through multiple self-attention layers. - Small tweak for CONDITIONAL_DETR: + Small tweak for Conditional DETR: - position_embeddings are added to the forward pass. 
@@ -1136,7 +1141,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) - # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + # in the original Conditional DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default # Initialize weights and apply final processing self.post_init() @@ -1341,7 +1346,7 @@ def forward( reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) obj_center = reference_points[..., :2].transpose(0, 1) # get sine embedding for the query vector - query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + query_sine_embed_before_transformation = gen_sine_position_embeddings(obj_center) for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1503,8 +1508,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") + >>> model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state @@ -1606,7 +1611,7 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): def __init__(self, config: ConditionalDetrConfig): super().__init__(config) - # CONDITIONAL_DETR encoder-decoder model + # CONDITIONAL DETR encoder-decoder model self.model = ConditionalDetrModel(config) # Object detection heads @@ -1662,8 +1667,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") + >>> model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) @@ -2374,7 +2379,7 @@ def forward(self, outputs, targets): return losses -# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDetr class ConditionalDetrMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, @@ -2396,7 +2401,6 @@ def forward(self, x): return x -# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py class ConditionalDetrHungarianMatcher(nn.Module): """ This class computes an assignment between the targets and the predictions of the network. 
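Putting the two usage docstrings above together with the feature extractor's `post_process` (adjusted later in this series to keep the top 300 queries) gives a complete detection loop. The sketch below is illustrative only: it assumes the `microsoft/conditional-detr-resnet-50` checkpoint referenced throughout this patch, and the 0.5 confidence threshold plus the printing format are our choices rather than part of the patch.

```python
# Assembled from the usage docstrings above plus the feature extractor's
# post_process; the score threshold is an arbitrary choice for this sketch.
import requests
import torch
from PIL import Image

from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForObjectDetection

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50")
model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50")

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# rescale the normalized predicted boxes back to the original image size (height, width)
target_sizes = torch.tensor([image.size[::-1]])
results = feature_extractor.post_process(outputs, target_sizes)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    if score > 0.5:
        print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```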
@@ -2421,7 +2425,7 @@ def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float self.class_cost = class_cost self.bbox_cost = bbox_cost self.giou_cost = giou_cost - if class_cost == 0 or bbox_cost == 0 or giou_cost == 0: + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: raise ValueError("All costs of the Matcher can't be 0") @torch.no_grad() @@ -2488,6 +2492,7 @@ def _upcast(t: Tensor) -> Tensor: return t if t.dtype in (torch.int32, torch.int64) else t.int() +# Copied from transformers.models.detr.modeling_detr.box_area def box_area(boxes: Tensor) -> Tensor: """ Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. @@ -2504,7 +2509,7 @@ def box_area(boxes: Tensor) -> Tensor: return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) -# modified from torchvision to also return the union +# Copied from transformers.models.detr.modeling_detr.box_iou def box_iou(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) @@ -2521,6 +2526,7 @@ def box_iou(boxes1, boxes2): return iou, union +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou def generalized_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. @@ -2554,7 +2560,7 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes - +# Copied from transformers.models.detr.modeling_detr.NestedTensor class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors @@ -2575,7 +2581,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) - +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: max_size = _max_by_axis([list(img.shape) for img in tensor_list]) diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index 2d21cafc68520..371d97de0000b 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -253,7 +253,7 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} # encode them - feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") # verify pixel values @@ -299,7 +299,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") # encode them - # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic + # TODO replace by .from_pretrained microsoft/conditional-detr-resnet-50-panoptic feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic") encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index fd008b9af494f..2fb4bb562d5bf 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ 
b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -456,13 +456,13 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") if is_vision_available() else None ) def test_inference_no_head(self): - model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -479,7 +479,7 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +505,7 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDetrForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() From ae30a7863df3f53ca5073493c7c8d4a30fa6bc8f Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:43:10 -0400 Subject: [PATCH 040/233] added some copied from --- .../feature_extraction_conditional_detr.py | 3 ++- .../models/conditional_detr/modeling_conditional_detr.py | 2 ++ .../conditional_detr/test_modeling_conditional_detr.py | 8 ++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index c9f5d8124e539..922cfaf09e98e 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -102,6 +102,7 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + # Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): @@ -399,7 +400,7 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + def __call__( self, images: ImageInput, diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 435a80b220b3b..a24a306b65e9f 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2560,6 +2560,7 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes + # Copied from transformers.models.detr.modeling_detr.NestedTensor class 
NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): @@ -2581,6 +2582,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) + # Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 2fb4bb562d5bf..a67f0bc30638e 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -479,7 +479,9 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to( + torch_device + ) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +507,9 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to( + torch_device + ) feature_extractor = self.default_feature_extractor image = prepare_img() From 67424e6e6d705452312c7baf9dab5885c655d5a1 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 14:52:49 -0400 Subject: [PATCH 041/233] fixed use_pretrained issue --- .../models/conditional_detr/configuration_conditional_detr.py | 4 ++++ .../models/conditional_detr/modeling_conditional_detr.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 6abcf85894735..df6673e9ebfef 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -134,6 +134,7 @@ class ConditionalDetrConfig(PretrainedConfig): def __init__( self, + num_channels=3, num_queries=300, max_position_embeddings=1024, encoder_layers=6, @@ -157,6 +158,7 @@ def __init__( auxiliary_loss=False, position_embedding_type="sine", backbone="resnet50", + use_pretrained_backbone=True, dilation=False, class_cost=2, bbox_cost=5, @@ -169,6 +171,7 @@ def __init__( focal_alpha=0.25, **kwargs ): + self.num_channels = num_channels self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -191,6 +194,7 @@ def __init__( self.auxiliary_loss = auxiliary_loss self.position_embedding_type = position_embedding_type self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.dilation = dilation # Hungarian matcher self.class_cost = class_cost diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a24a306b65e9f..9e70e1b873cd6 100644 --- 
a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1452,7 +1452,9 @@ def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # Create backbone + positional encoding - backbone = ConditionalDetrTimmConvEncoder(config.backbone, config.dilation) + backbone = ConditionalDetrTimmConvEncoder( + config.backbone, config.dilation, config.use_pretrained_backbone, config.num_channels + ) position_embeddings = build_position_encoding(config) self.backbone = ConditionalDetrConvModel(backbone, position_embeddings) From 908141d50cd18cdb04f1a07afb00a29812a533ba Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 16 Sep 2022 10:35:52 -0400 Subject: [PATCH 042/233] changed post-process --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 922cfaf09e98e..be85334d7b354 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -701,7 +701,7 @@ def post_process(self, outputs, target_sizes): raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1) scores = topk_values topk_boxes = topk_indexes // out_logits.shape[2] labels = topk_indexes % out_logits.shape[2] From 256145fc6ba75d9c217075bc18eed0dd88c8e0dd Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:47:29 -0400 Subject: [PATCH 043/233] added conditional_detr files --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/conditional_detr.mdx | 53 + docs/source/en/serialization.mdx | 1 + src/transformers/__init__.py | 20 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/conditional_detr/__init__.py | 87 + .../configuration_conditional_detr.py | 242 ++ ..._original_pytorch_checkpoint_to_pytorch.py | 298 ++ .../feature_extraction_conditional_detr.py | 938 ++++++ .../modeling_conditional_detr.py | 2547 +++++++++++++++++ .../utils/dummy_vision_objects.py | 7 + tests/models/conditional_detr/__init__.py | 0 ...est_feature_extraction_conditional_detr.py | 338 +++ .../test_modeling_conditional_detr.py | 538 ++++ 21 files changed, 5084 insertions(+) create mode 100644 docs/source/en/model_doc/conditional_detr.mdx create mode 100644 src/transformers/models/conditional_detr/__init__.py create mode 100644 src/transformers/models/conditional_detr/configuration_conditional_detr.py create mode 100644 src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py create mode 100644 src/transformers/models/conditional_detr/modeling_conditional_detr.py create mode 100644 tests/models/conditional_detr/__init__.py create mode 100644 
tests/models/conditional_detr/test_feature_extraction_conditional_detr.py create mode 100644 tests/models/conditional_detr/test_modeling_conditional_detr.py diff --git a/README.md b/README.md index 6071d7885d021..986631466858f 100644 --- a/README.md +++ b/README.md @@ -276,6 +276,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_ko.md b/README_ko.md index 2a42664e20d77..ee2d52cf5ac80 100644 --- a/README_ko.md +++ b/README_ko.md @@ -228,6 +228,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_zh-hans.md b/README_zh-hans.md index 7d0642cc9abff..e0897cfbb012b 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -252,6 +252,7 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 +1. 
**[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang发布。 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index c4de5181002d7..b0536426a45e5 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -264,6 +264,7 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index f118359bc57be..7830c57e6eabd 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,6 +68,7 @@ The documentation is organized into five sections: 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. @@ -215,6 +216,7 @@ Flax), PyTorch, and/or TensorFlow. | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | +| conditional_detr | ❌ | ❌ | ✅ | ❌ | ❌ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx new file mode 100644 index 0000000000000..51316a24e43ee --- /dev/null +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -0,0 +1,53 @@ + + +# Conditional DETR + +## Overview + +The conditional DETR model was proposed in [ConditionalDETR](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 
Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. + +The abstract from the paper is the following: + +*The recently-developed DETR approach applies the transformer encoder and decoder architecture to object detection and achieves promising performance. In this paper, we handle the critical issue, slow training convergence, and present a conditional cross-attention mechanism for fast DETR training. Our approach is motivated by that the cross-attention in DETR relies highly on the content embeddings for localizing the four extremities and predicting the box, which increases the need for high-quality content embeddings and thus the training difficulty. Our approach, named conditional DETR, learns a conditional spatial query from the decoder embedding for decoder multi-head cross-attention. The benefit is that through the conditional spatial query, each cross-attention head is able to attend to a band containing a distinct region, e.g., one object extremity or a region inside the object box. This narrows down the spatial range for localizing the distinct regions for object classification and box regression, thus relaxing the dependence on the content embeddings and easing the training. Empirical results show that conditional DETR converges 6.7× faster for the backbones R50 and R101 and 10× faster for stronger backbones DC5-R50 and DC5-R101. Code is available at https://github.com/Atten4Vis/ConditionalDETR.* + + +This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The original code can be found [here](https://github.com/Atten4Vis/ConditionalDETR). + + +## ConditionalDETRConfig + +[[autodoc]] ConditionalDETRConfig + +## ConditionalDETRFeatureExtractor + +[[autodoc]] ConditionalDETRFeatureExtractor + - __call__ + - pad_and_create_pixel_mask + - post_process + - post_process_segmentation + - post_process_panoptic + +## ConditionalDETRModel + +[[autodoc]] ConditionalDETRModel + - forward + +## ConditionalDETRForObjectDetection + +[[autodoc]] ConditionalDETRForObjectDetection + - forward + +## ConditionalDETRForSegmentation + +[[autodoc]] ConditionalDETRForSegmentation + - forward \ No newline at end of file diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 74f50c78513ce..f6304004b62ae 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -57,6 +57,7 @@ Ready-made configurations include the following architectures: - CamemBERT - CLIP - CodeGen +- conditional_detr - ConvBERT - ConvNeXT - Data2VecText diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3c3a3a5006416..9da049462d860 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -172,6 +172,7 @@ "CLIPVisionConfig", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], + "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDETRConfig"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.cpm": [], @@ -660,6 +661,7 @@ _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") + 
_import_structure["models.conditional_detr"].append("ConditionalDETRFeatureExtractor") _import_structure["models.donut"].append("DonutFeatureExtractor") _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaProcessor"]) @@ -708,6 +710,15 @@ "DetrPreTrainedModel", ] ) + _import_structure["models.conditional_detr"].extend( + [ + "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", + "ConditionalDETRForObjectDetection", + "ConditionalDETRForSegmentation", + "ConditionalDETRModel", + "ConditionalDETRPreTrainedModel", + ] + ) try: if not is_scatter_available(): @@ -3075,6 +3086,7 @@ CLIPVisionConfig, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer + from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDETRConfig from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer @@ -3498,6 +3510,7 @@ from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor + from .models.conditional_detr import ConditionalDETRFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor @@ -3533,6 +3546,13 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) + from .models.conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ConditionalDETRModel, + ConditionalDETRPreTrainedModel, + ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, DetrForObjectDetection, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index fbdbfd579cb9e..8a6622a9f35b6 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -38,6 +38,7 @@ canine, clip, codegen, + conditional_detr, convbert, convnext, cpm, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 1204e6608a768..b28bb0665bc3d 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -43,6 +43,7 @@ ("canine", "CanineConfig"), ("clip", "CLIPConfig"), ("codegen", "CodeGenConfig"), + ("conditional_detr", "ConditionalDETRConfig"), ("convbert", "ConvBertConfig"), ("convnext", "ConvNextConfig"), ("ctrl", "CTRLConfig"), @@ -175,6 +176,7 @@ ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("codegen", "CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("conditional_detr", "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convbert", "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convnext", "CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ctrl", "CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -300,6 +302,7 @@ ("canine", "CANINE"), ("clip", "CLIP"), ("codegen", "CodeGen"), + ("conditional_detr", "conditional_detr"), ("convbert", "ConvBERT"), ("convnext", "ConvNeXT"), ("cpm", "CPM"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 015fd132ef0dc..9ba8295b406da 100644 --- 
a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,8 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 7f4968d03cdf6..025db1fa81cbe 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -42,6 +42,7 @@ ("canine", "CanineModel"), ("clip", "CLIPModel"), ("codegen", "CodeGenModel"), + ("conditional_detr", "ConditionalDETRModel"), ("convbert", "ConvBertModel"), ("convnext", "ConvNextModel"), ("ctrl", "CTRLModel"), @@ -373,6 +374,7 @@ [ # Do not add new models here, this class will be deprecated in the future. # Model for Image Segmentation mapping + ("conditional_detr", "ConditionalDETRForSegmentation"), ("detr", "DetrForSegmentation"), ] ) @@ -456,6 +458,7 @@ [ # Model for Object Detection mapping ("deformable_detr", "DeformableDetrForObjectDetection"), + ("conditional_detr", "ConditionalDETRForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/conditional_detr/__init__.py b/src/transformers/models/conditional_detr/__init__.py new file mode 100644 index 0000000000000..1b72e7d7a3238 --- /dev/null +++ b/src/transformers/models/conditional_detr/__init__.py @@ -0,0 +1,87 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_timm_available, is_vision_available + + +_import_structure = { + "configuration_conditional_detr": [ + "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", + "ConditionalDETRConfig", + "ConditionalDETROnnxConfig", + ] +} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDETRFeatureExtractor"] + +try: + if not is_timm_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_conditional_detr"] = [ + "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", + "ConditionalDETRForObjectDetection", + "ConditionalDETRForSegmentation", + "ConditionalDETRModel", + "ConditionalDETRPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, + ConditionalDETRConfig, + ConditionalDETROnnxConfig, + ) + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_conditional_detr import ConditionalDETRFeatureExtractor + + try: + if not is_timm_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ConditionalDETRModel, + ConditionalDETRPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py new file mode 100644 index 0000000000000..ce004dc6fc15c --- /dev/null +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -0,0 +1,242 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" CONDITIONAL_DETR model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "DeppMeng/ConditionalDETR": "https://huggingface.co/DeppMeng/ConditionalDETR/resolve/main/config.json", +} + + +class ConditionalDETRConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. 
It is used to instantiate + a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR + [DeppMeng/ConditionalDETR](https://huggingface.co/DeppMeng/ConditionalDETR) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 100): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`ConditionalDETRModel`] can detect in a single image. For COCO, we recommend 100 queries. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a + list of all available models, see [this + page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). 
+ use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + + Examples: + + ```python + >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig + + >>> # Initializing a CONDITIONAL_DETR DeppMeng/ConditionalDETR style configuration + >>> configuration = ConditionalDETRConfig() + + >>> # Initializing a model from the DeppMeng/ConditionalDETR style configuration + >>> model = ConditionalDETRModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "conditional_detr" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + num_queries=300, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + classifier_dropout=0.0, + scale_embedding=False, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + dilation=False, + class_cost=2, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + cls_loss_coefficient=2, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + focal_alpha=0.25, + **kwargs + ): + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + 
self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.dilation = dilation + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.cls_loss_coefficient = cls_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.focal_alpha = focal_alpha + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + +class ConditionalDETROnnxConfig(OnnxConfig): + + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("pixel_mask", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-5 + + @property + def default_onnx_opset(self) -> int: + return 12 diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000..2d7abc3c088b5 --- /dev/null +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,298 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert CONDITIONAL_DETR checkpoints.""" + + +import argparse +import json +from collections import OrderedDict +from pathlib import Path + +import torch +from PIL import Image + +import requests +from huggingface_hub import hf_hub_download +from transformers import ( + ConditionalDETRConfig, + ConditionalDETRFeatureExtractor, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +# here we list all keys to be renamed (original name on the left, our name on the right) +rename_keys = [] +for i in range(6): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) + # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.cross_attn.out_proj.weight", + f"decoder.layers.{i}.encoder_attn.out_proj.weight", + ) + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.cross_attn.out_proj.bias", + f"decoder.layers.{i}.encoder_attn.out_proj.bias", + ) + ) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") + ) + 
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) + + # q, k, v projections in self/cross-attention in decoder for conditional DETR + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight")) + # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight")) + + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias")) + # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias")) + +# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads +# for conditional DETR, also convert reference point head and query scale MLP +rename_keys.extend( + [ + ("input_proj.weight", "input_projection.weight"), + ("input_proj.bias", "input_projection.bias"), + ("query_embed.weight", "query_position_embeddings.weight"), + ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), + ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), + ("class_embed.weight", "class_labels_classifier.weight"), 
+ ("class_embed.bias", "class_labels_classifier.bias"), + ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), + ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), + ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), + ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), + ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), + ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), + ("transformer.decoder.ref_point_head.layers.0.weight", "decoder.ref_point_head.layers.0.weight"), + ("transformer.decoder.ref_point_head.layers.0.bias", "decoder.ref_point_head.layers.0.bias"), + ("transformer.decoder.ref_point_head.layers.1.weight", "decoder.ref_point_head.layers.1.weight"), + ("transformer.decoder.ref_point_head.layers.1.bias", "decoder.ref_point_head.layers.1.bias"), + ("transformer.decoder.query_scale.layers.0.weight", "decoder.query_scale.layers.0.weight"), + ("transformer.decoder.query_scale.layers.0.bias", "decoder.query_scale.layers.0.bias"), + ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), + ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), + ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), + ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias") + ] +) + + +def rename_key(state_dict, old, new): + val = state_dict.pop(old) + state_dict[new] = val + + +def rename_backbone_keys(state_dict): + new_state_dict = OrderedDict() + for key, value in state_dict.items(): + if "backbone.0.body" in key: + new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") + new_state_dict[new_key] = value + else: + new_state_dict[key] = value + + return new_state_dict + + +def read_in_q_k_v(state_dict, is_panoptic=False): + prefix = "" + if is_panoptic: + prefix = "conditional_detr." + + # first: transformer encoder + for i in range(6): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our CONDITIONAL_DETR structure. 
+ """ + + # load default config + config = ConditionalDETRConfig() + # set backbone and dilation attributes + if "resnet101" in model_name: + config.backbone = "resnet101" + if "dc5" in model_name: + config.dilation = True + is_panoptic = "panoptic" in model_name + if is_panoptic: + config.num_labels = 250 + else: + config.num_labels = 91 + repo_id = "datasets/huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load feature extractor + format = "coco_panoptic" if is_panoptic else "coco_detection" + feature_extractor = ConditionalDETRFeatureExtractor(format=format) + + # prepare image + img = prepare_img() + encoding = feature_extractor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info(f"Converting model {model_name}...") + + # load original model from torch hub + conditional_detr = torch.hub.load("DeppMeng/ConditionalDETR", model_name, pretrained=True).eval() + state_dict = conditional_detr.state_dict() + # rename keys + for src, dest in rename_keys: + if is_panoptic: + src = "conditional_detr." + src + rename_key(state_dict, src, dest) + state_dict = rename_backbone_keys(state_dict) + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, is_panoptic=is_panoptic) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "conditional_detr.model." if is_panoptic else "model." + for key in state_dict.copy().keys(): + if is_panoptic: + if ( + key.startswith("conditional_detr") + and not key.startswith("class_labels_classifier") + and not key.startswith("bbox_predictor") + ): + val = state_dict.pop(key) + state_dict["conditional_detr.model" + key[4:]] = val + elif "class_labels_classifier" in key or "bbox_predictor" in key: + val = state_dict.pop(key) + state_dict["conditional_detr." 
+ key] = val + elif key.startswith("bbox_attention") or key.startswith("mask_head"): + continue + else: + val = state_dict.pop(key) + state_dict[prefix + key] = val + else: + if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = ConditionalDETRForSegmentation(config) if is_panoptic else ConditionalDETRForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + # verify our conversion + original_outputs = conditional_detr(pixel_values) + outputs = model(pixel_values) + assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) + assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) + if is_panoptic: + assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) + + # Save model and feature extractor + logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name", + default="conditional_detr_resnet50", + type=str, + help="Name of the CONDITIONAL_DETR model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + args = parser.parse_args() + convert_conditional_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py new file mode 100644 index 0000000000000..07550124c157e --- /dev/null +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -0,0 +1,938 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
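As a usage note for the conversion script above: the entry point can also be invoked directly from Python, as in the following sketch. The model name is the script's default; the output directory is illustrative.

```python
from transformers.models.conditional_detr.convert_conditional_detr_original_pytorch_checkpoint_to_pytorch import (
    convert_conditional_detr_checkpoint,
)

# Downloads the original checkpoint via torch.hub (DeppMeng/ConditionalDETR), remaps and splits the
# weights, verifies the outputs against the original model and saves the converted model together
# with its feature extractor.
convert_conditional_detr_checkpoint(
    model_name="conditional_detr_resnet50",
    pytorch_dump_folder_path="./conditional_detr_resnet50",
)
```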
+"""Feature extractor class for CONDITIONAL_DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Dict, List, Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor +from ...utils import TensorType, is_torch_available, logging + + +if is_torch_available(): + import torch + from torch import nn + +logger = logging.get_logger(__name__) + + +ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] + + +# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +def center_to_corners_format(x): + """ + Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format + (x_0, y_0, x_1, y_1). + """ + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +def corners_to_center_format(x): + """ + Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1, + y_1) to center format (center_x, center_y, width, height). + """ + x_transposed = x.T + x0, y0, x1, y1 = x_transposed[0], x_transposed[1], x_transposed[2], x_transposed[3] + b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] + return np.stack(b, axis=-1) + + +def masks_to_boxes(masks): + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + + Returns a [N, 4] tensor, with the boxes in corner (xyxy) format. + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. 
+def rgb_to_id(color): + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def id_to_rgb(id_map): + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class ConditionalDETRFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a CONDITIONAL_DETR feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + + Args: + format (`str`, *optional*, defaults to `"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input to a certain `size`. + size (`int`, *optional*, defaults to 800): + Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a + sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of + the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * + height / width, size)`. + max_size (`int`, *optional*, defaults to `1333`): + The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is + set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`): + The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean. + image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the + ImageNet std. 
+ """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + format="coco_detection", + do_resize=True, + size=800, + max_size=1333, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.format = self._is_valid_format(format) + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean + self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std + + def _is_valid_format(self, format): + if format not in ["coco_detection", "coco_panoptic"]: + raise ValueError(f"Format {format} not supported") + return format + + def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): + if self.format == "coco_detection": + image, target = self.prepare_coco_detection(image, target, return_segmentation_masks) + return image, target + elif self.format == "coco_panoptic": + image, target = self.prepare_coco_panoptic(image, target, masks_path) + return image, target + else: + raise ValueError(f"Format {self.format} not supported") + + # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 + def convert_coco_poly_to_mask(self, segmentations, height, width): + + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 + def prepare_coco_detection(self, image, target, return_segmentation_masks=False): + """ + Convert the target in COCO format into the format expected by CONDITIONAL_DETR. 
+ """ + w, h = image.size + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # get all COCO annotations for the given image + anno = target["annotations"] + + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=w) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = np.asarray(classes, dtype=np.int64) + + if return_segmentation_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = self.convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.reshape((-1, 3)) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if return_segmentation_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["class_labels"] = classes + if return_segmentation_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in anno], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno], dtype=np.int64) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + + return image, target + + def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): + w, h = image.size + ann_info = target.copy() + ann_path = pathlib.Path(masks_path) / ann_info["file_name"] + + if "segments_info" in ann_info: + masks = np.asarray(Image.open(ann_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([ann["id"] for ann in ann_info["segments_info"]]) + masks = masks == ids[:, None, None] + masks = np.asarray(masks, dtype=np.uint8) + + labels = np.asarray([ann["category_id"] for ann in ann_info["segments_info"]], dtype=np.int64) + + target = {} + target["image_id"] = np.asarray( + [ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]], dtype=np.int64 + ) + if return_masks: + target["masks"] = masks + target["class_labels"] = labels + + target["boxes"] = masks_to_boxes(masks) + + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + if "segments_info" in ann_info: + target["iscrowd"] = np.asarray([ann["iscrowd"] for ann in ann_info["segments_info"]], dtype=np.int64) + target["area"] = np.asarray([ann["area"] for ann in ann_info["segments_info"]], dtype=np.float32) + + return image, target + + def _resize(self, image, size, target=None, max_size=None): + """ + Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller + edge of the image will be matched to this number. + + If given, also resize the target accordingly. 
+ """ + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size + else: + # size returned must be (w, h) since we use PIL to resize images + # so we revert the tuple + return get_size_with_aspect_ratio(image_size, size, max_size)[::-1] + + size = get_size(image.size, size, max_size) + rescaled_image = self.resize(image, size=size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + w, h = size + target["size"] = np.asarray([h, w], dtype=np.int64) + + if "masks" in target: + # use PyTorch as current workaround + # TODO replace by self.resize + masks = torch.from_numpy(target["masks"][:, None]).float() + interpolated_masks = nn.functional.interpolate(masks, size=(h, w), mode="nearest")[:, 0] > 0.5 + target["masks"] = interpolated_masks.numpy() + + return rescaled_image, target + + def _normalize(self, image, mean, std, target=None): + """ + Normalize the image with a certain mean and std. + + If given, also normalize the target bounding boxes based on the size of the image. + """ + + image = self.normalize(image, mean=mean, std=std) + if target is None: + return image, None + + target = target.copy() + h, w = image.shape[-2:] + + if "boxes" in target: + boxes = target["boxes"] + boxes = corners_to_center_format(boxes) + boxes = boxes / np.asarray([w, h, w, h], dtype=np.float32) + target["boxes"] = boxes + + return image, target + + def __call__( + self, + images: ImageInput, + annotations: Union[List[Dict], List[List[Dict]]] = None, + return_segmentation_masks: Optional[bool] = False, + masks_path: Optional[pathlib.Path] = None, + pad_and_return_pixel_mask: Optional[bool] = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s) and optional annotations. Images are by default + padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are + real/which are padding. + + + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. 
In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + annotations (`Dict`, `List[Dict]`, *optional*): + The corresponding annotations in COCO format. + + In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_detection"`, the + annotations for each image should have the following format: {'image_id': int, 'annotations': + [annotation]}, with the annotations being a list of COCO object annotations. + + In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the + annotations for each image should have the following format: {'image_id': int, 'file_name': str, + 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic annotations. + + return_segmentation_masks (`Dict`, `List[Dict]`, *optional*, defaults to `False`): + Whether to also include instance segmentation masks as part of the labels in case `format = + "coco_detection"`. + + masks_path (`pathlib.Path`, *optional*): + Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only + relevant in case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`. + + pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether or not to pad images up to the largest image in a batch and create a pixel mask. + + If left to the default, will return a pixel mask that is: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor` + objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if + *"pixel_mask"* is in `self.model_input_names`). + - **labels** -- Optional labels to be fed to a model (when `annotations` are provided) + """ + # Input type checking for clearer error + + valid_images = False + valid_annotations = False + valid_masks_path = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." 
+ ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + # Check that annotations has a valid type + if annotations is not None: + if not is_batched: + if self.format == "coco_detection": + if isinstance(annotations, dict) and "image_id" in annotations and "annotations" in annotations: + if isinstance(annotations["annotations"], (list, tuple)): + # an image can have no annotations + if len(annotations["annotations"]) == 0 or isinstance(annotations["annotations"][0], dict): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations, dict) and "image_id" in annotations and "segments_info" in annotations: + if isinstance(annotations["segments_info"], (list, tuple)): + # an image can have no segments (?) + if len(annotations["segments_info"]) == 0 or isinstance( + annotations["segments_info"][0], dict + ): + valid_annotations = True + else: + if isinstance(annotations, (list, tuple)): + if len(images) != len(annotations): + raise ValueError("There must be as many annotations as there are images") + if isinstance(annotations[0], Dict): + if self.format == "coco_detection": + if isinstance(annotations[0]["annotations"], (list, tuple)): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations[0]["segments_info"], (list, tuple)): + valid_annotations = True + + if not valid_annotations: + raise ValueError( + """ + Annotations must of type `Dict` (single image) or `List[Dict]` (batch of images). In case of object + detection, each dictionary should contain the keys 'image_id' and 'annotations', with the latter + being a list of annotations in COCO format. In case of panoptic segmentation, each dictionary + should contain the keys 'file_name', 'image_id' and 'segments_info', with the latter being a list + of annotations in COCO format. + """ + ) + + # Check that masks_path has a valid type + if masks_path is not None: + if self.format == "coco_panoptic": + if isinstance(masks_path, pathlib.Path): + valid_masks_path = True + if not valid_masks_path: + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + " `pathlib.Path` object." 
+ ) + + if not is_batched: + images = [images] + if annotations is not None: + annotations = [annotations] + + # prepare (COCO annotations as a list of Dict -> CONDITIONAL_DETR target as a single Dict per image) + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + image, target = self.prepare(image, target, return_segmentation_masks, masks_path) + images[idx] = image + annotations[idx] = target + + # transformations (resizing + normalization) + if self.do_resize and self.size is not None: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._resize(image=image, target=target, size=self.size, max_size=self.max_size) + images[idx] = image + annotations[idx] = target + else: + for idx, image in enumerate(images): + images[idx] = self._resize(image=image, target=None, size=self.size, max_size=self.max_size)[0] + + if self.do_normalize: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._normalize( + image=image, mean=self.image_mean, std=self.image_std, target=target + ) + images[idx] = image + annotations[idx] = target + else: + images = [ + self._normalize(image=image, mean=self.image_mean, std=self.image_std)[0] for image in images + ] + + if pad_and_return_pixel_mask: + # pad images up to largest image in batch and create pixel_mask + max_size = self._max_by_axis([list(image.shape) for image in images]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in images: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + images = padded_images + + # return as BatchFeature + data = {} + data["pixel_values"] = images + if pad_and_return_pixel_mask: + data["pixel_mask"] = pixel_mask + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + # Convert to TensorType + tensor_type = return_tensors + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + if not tensor_type == TensorType.PYTORCH: + raise ValueError("Only PyTorch is supported for the moment.") + else: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + + encoded_inputs["labels"] = [ + {k: torch.from_numpy(v) for k, v in target.items()} for target in annotations + ] + + return encoded_inputs + + def _max_by_axis(self, the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + def pad_and_create_pixel_mask( + self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None + ): + """ + Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. + + Args: + pixel_values_list (`List[torch.Tensor]`): + List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. 
If set to `'pt'`, return PyTorch `torch.Tensor` + objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if + *"pixel_mask"* is in `self.model_input_names`). + + """ + + max_size = self._max_by_axis([list(image.shape) for image in pixel_values_list]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in pixel_values_list: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + + # return as BatchFeature + data = {"pixel_values": padded_images, "pixel_mask": pixel_mask} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs + + # POSTPROCESSING METHODS + # inspired by https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py#L258 + def post_process(self, outputs, target_sizes): + """ + Converts the output of [`ConditionalDETRForObjectDetection`] into the format expected by the COCO api. Only + supports PyTorch. + + Args: + outputs ([`ConditionalDETRObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). For visualization, this should be the image size after data + augment, but before padding. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = topk_indexes // out_logits.shape[2] + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into image segmentation predictions. Only supports + PyTorch. + + Parameters: + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. 
+ threshold (`float`, *optional*, defaults to 0.9): + Threshold to use to filter out queries. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image + in the batch as predicted by the model. + """ + out_logits, raw_masks = outputs.logits, outputs.pred_masks + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1 + + predictions = {"scores": cur_scores, "labels": cur_classes, "masks": cur_masks} + preds.append(predictions) + return preds + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into actual instance segmentation predictions. Only + supports PyTorch. + + Args: + results (`List[Dict]`): + Results list obtained by [`~ConditionalDETRFeatureExtractor.post_process`], to which "masks" results + will be added. + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). + max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). + threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an + image in the batch as predicted by the model. 
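+
+            The "masks" entries are obtained by binarizing the predicted masks at `threshold` (after a sigmoid),
+            cropping them to each image's unpadded size and resizing them to the original image size with
+            nearest-neighbor interpolation.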
+ """ + + if len(orig_target_sizes) != len(max_target_sizes): + raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes") + max_h, max_w = max_target_sizes.max(0)[0].tolist() + outputs_masks = outputs.pred_masks.squeeze(2) + outputs_masks = nn.functional.interpolate( + outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False + ) + outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() + + for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + img_h, img_w = t[0], t[1] + results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) + results[i]["masks"] = nn.functional.interpolate( + results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" + ).byte() + + return results + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 + def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into actual panoptic predictions. Only supports + PyTorch. + + Parameters: + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data + augmentation but before batching. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to + None, it will default to the `processed_sizes`. + is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + Dictionary mapping class indices to either True or False, depending on whether or not they are a thing. + If not set, defaults to the `is_thing_map` of COCO panoptic. + threshold (`float`, *optional*, defaults to 0.85): + Threshold to use to filter out queries. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for + an image in the batch as predicted by the model. 
+ """ + if target_sizes is None: + target_sizes = processed_sizes + if len(processed_sizes) != len(target_sizes): + raise ValueError("Make sure to pass in as many processed_sizes as target_sizes") + + if is_thing_map is None: + # default to is_thing_map of COCO panoptic + is_thing_map = {i: i <= 90 for i in range(201)} + + out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes + if not len(out_logits) == len(raw_masks) == len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" + ) + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = center_to_corners_format(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + if len(cur_boxes) != len(cur_classes): + raise ValueError("Not as many boxes as there are classes") + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_classes): + if not is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) + + np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) + np_seg_img = np_seg_img.view(final_h, final_w, 3) + np_seg_img = np_seg_img.numpy() + + m_id = torch.from_numpy(rgb_to_id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_classes.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + 
cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_classes[i].item() + segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) + del cur_classes + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py new file mode 100644 index 0000000000000..1f63d22be27a5 --- /dev/null +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -0,0 +1,2547 @@ +# coding=utf-8 +# Copyright 2022 Attn4Vis and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Conditional DETR model.""" + + +import math +import random +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +from torch import Tensor, nn +from torch.nn import functional as F + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import torch_int_div +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + is_vision_available, + logging, + replace_return_docstrings, + requires_backends, +) +from .configuration_conditional_detr import ConditionalDETRConfig + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_vision_available(): + from .feature_extraction_conditional_detr import center_to_corners_format + +if is_timm_available(): + from timm import create_model + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ConditionalDETRConfig" +_CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" + +CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Atten4Vis/ConditionalDETR", + # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr +] + + + +@dataclass +class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): + """ + Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + reference_points: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class ConditionalDETRModelOutput(Seq2SeqModelOutput): + """ + Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + reference_points: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDETR +class ConditionalDETRObjectDetectionOutput(ModelOutput): + """ + Output type of [`ConditionalDETRForObjectDetection`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding + boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. 
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDETR +class ConditionalDETRSegmentationOutput(ModelOutput): + """ + Output type of [`ConditionalDETRForSegmentation`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. 
+ loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding + boxes. + pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): + Segmentation masks logits for all queries. See also [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks + respectively. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. 
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + pred_masks: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +# BELOW: utilities copied from +# https://github.com/facebookresearch/detr/blob/master/backbone.py +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDETR +class ConditionalDETRFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super(ConditionalDETRFrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(ConditionalDETRFrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDETR +def replace_batch_norm(m, name=""): + for attr_str in dir(m): + target_attr = getattr(m, attr_str) + if isinstance(target_attr, nn.BatchNorm2d): + frozen = ConditionalDETRFrozenBatchNorm2d(target_attr.num_features) + bn = getattr(m, attr_str) + frozen.weight.data.copy_(bn.weight) + frozen.bias.data.copy_(bn.bias) + frozen.running_mean.data.copy_(bn.running_mean) + frozen.running_var.data.copy_(bn.running_var) + setattr(m, attr_str, frozen) + for n, ch in m.named_children(): + replace_batch_norm(ch, n) + + +# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder with Detr->ConditionalDETR +class ConditionalDETRTimmConvEncoder(nn.Module): + """ + Convolutional encoder 
(backbone) from the timm library. + + nn.BatchNorm2d layers are replaced by ConditionalDETRFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, name: str, dilation: bool): + super().__init__() + + kwargs = {} + if dilation: + kwargs["output_stride"] = 16 + + requires_backends(self, ["timm"]) + + backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs) + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = self.model.feature_info.channels() + + if "resnet" in name: + for name, parameter in self.model.named_parameters(): + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDETR +class ConditionalDETRConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + +# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDETR +class ConditionalDETRSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
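+
+    Concretely, each (optionally normalized) x- and y-coordinate is divided by `temperature ** (2 * (i // 2) /
+    embedding_dim)` and passed through interleaved sine and cosine functions; the resulting y- and x-embeddings are
+    concatenated along the channel dimension, giving position features of shape `(batch_size, 2 * embedding_dim,
+    height, width)`.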
+ """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2) / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDETR +class ConditionalDETRLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + h, w = pixel_values.shape[-2:] + i = torch.arange(w, device=pixel_values.device) + j = torch.arange(h, device=pixel_values.device) + x_emb = self.column_embeddings(i) + y_emb = self.row_embeddings(j) + pos = torch.cat([x_emb.unsqueeze(0).repeat(h, 1, 1), y_emb.unsqueeze(1).repeat(1, w, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDETR +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = ConditionalDETRSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = ConditionalDETRLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + +# function to generate sine positional embedding for 2d coordinates +def gen_sineembed_for_position(pos_tensor): + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * (dim_t // 2) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + pos = torch.cat((pos_y, pos_x), dim=2) + return pos + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - 
x).clamp(min=eps) + return torch.log(x1/x2) + +class DetrAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the CONDITIONAL_DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + key_value_states: Optional[torch.Tensor] = None, + key_value_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # add key-value position embeddings to the key value states + if key_value_position_embeddings is not None: + key_value_states_original = key_value_states + key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states_original), -1, bsz) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states_original), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, 
tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + +class ConditionalDETRAttention(nn.Module): + """ + Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. + + The key q_proj, k_proj, v_proj are defined outside the attention. + This attention allows the dim of q, k to be different to v. + """ + + def __init__( + self, + embed_dim: int, + out_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.out_dim = out_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + ) + # head dimension of values + self.v_head_dim = out_dim // num_heads + if self.v_head_dim * num_heads != self.out_dim: + raise ValueError( + f"out_dim must be divisible by num_heads (got `out_dim`: {self.out_dim} and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + + self.out_proj = nn.Linear(out_dim, out_dim, bias=bias) + + def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + key_states: Optional[torch.Tensor] = None, + value_states: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = hidden_states * self.scaling + # get key, value proj + key_states = self._qk_shape(key_states, -1, bsz) + value_states = self._v_shape(value_states, -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + v_proj_shape = (bsz * self.num_heads, -1, self.v_head_dim) + query_states = self._qk_shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*v_proj_shape) + + src_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.v_head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.v_head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.out_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class ConditionalDETREncoderLayer(nn.Module): + def __init__(self, config: ConditionalDETRConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_embeddings (`torch.FloatTensor`, *optional*): position embeddings, to be added to hidden_states. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class ConditionalDETRDecoderLayer(nn.Module): + def __init__(self, config: ConditionalDETRConfig): + super().__init__() + self.embed_dim = config.d_model + + d_model = config.d_model + # Decoder Self-Attention projections + self.sa_qcontent_proj = nn.Linear(d_model, d_model) + self.sa_qpos_proj = nn.Linear(d_model, d_model) + self.sa_kcontent_proj = nn.Linear(d_model, d_model) + self.sa_kpos_proj = nn.Linear(d_model, d_model) + self.sa_v_proj = nn.Linear(d_model, d_model) + + self.self_attn = ConditionalDETRAttention( + embed_dim=self.embed_dim, + out_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + + # Decoder Cross-Attention projections + self.ca_qcontent_proj = nn.Linear(d_model, d_model) + self.ca_qpos_proj = nn.Linear(d_model, d_model) + self.ca_kcontent_proj = nn.Linear(d_model, d_model) + self.ca_kpos_proj = nn.Linear(d_model, d_model) + self.ca_v_proj = nn.Linear(d_model, d_model) + self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) + + self.encoder_attn = ConditionalDETRAttention( + self.embed_dim*2, + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + self.nhead = config.decoder_attention_heads + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + query_sine_embed: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + is_first: Optional[bool] = False + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, 
src_len)` where padding elements are indicated by very large negative values. + position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the cross-attention layer. + query_position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the self-attention layer. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # ========== Begin of Self-Attention ============= + # Apply projections here + # shape: num_queries x batch_size x 256 + q_content = self.sa_qcontent_proj(hidden_states) # target is the input of the first decoder layer. zero by default. + q_pos = self.sa_qpos_proj(query_position_embeddings) + k_content = self.sa_kcontent_proj(hidden_states) + k_pos = self.sa_kpos_proj(query_position_embeddings) + v = self.sa_v_proj(hidden_states) + + bs, num_queries, n_model = q_content.shape + _, hw, _ = k_content.shape + + q = q_content + q_pos + k = k_content + k_pos + hidden_states, self_attn_weights = self.self_attn( + hidden_states=q, + attention_mask=attention_mask, + key_states=k, + value_states=v, + output_attentions=output_attentions, + ) + # ============ End of Self-Attention ============= + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # ========== Begin of Cross-Attention ============= + # Apply projections here + # shape: num_queries x batch_size x 256 + q_content = self.ca_qcontent_proj(hidden_states) + k_content = self.ca_kcontent_proj(encoder_hidden_states) + v = self.ca_v_proj(encoder_hidden_states) + + bs, num_queries, n_model = q_content.shape + _, hw, _ = k_content.shape + + k_pos = self.ca_kpos_proj(position_embeddings) + + # For the first decoder layer, we concatenate the positional embedding predicted from + # the object query (the positional embedding) into the original query (key) in DETR. 
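+        # In the conditional cross-attention below, the query of each head is the concatenation of a content part
+        # (the projected decoder hidden state) and a spatial part (the projected sine embedding of the reference
+        # point), and the key is likewise the concatenation of the encoder content and its position embedding, so
+        # that content and spatial similarities contribute to separate halves of the attention dot product.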
+ if is_first: + q_pos = self.ca_qpos_proj(query_position_embeddings) + q = q_content + q_pos + k = k_content + k_pos + else: + q = q_content + k = k_content + + q = q.view(bs, num_queries, self.nhead, n_model//self.nhead) + query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) + query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model//self.nhead) + q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) + k = k.view(bs, hw, self.nhead, n_model//self.nhead) + k_pos = k_pos.view(bs, hw, self.nhead, n_model//self.nhead) + k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=q, + attention_mask=encoder_attention_mask, + key_states=k, + value_states=v, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # ============ End of Cross-Attention ============= + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDETR +class ConditionalDETRClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDETR +class ConditionalDETRPreTrainedModel(PreTrainedModel): + config_class = ConditionalDETRConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + xavier_std = self.config.init_xavier_std + + if isinstance(module, ConditionalDETRMHAttentionMap): + nn.init.zeros_(module.k_linear.bias) + 
nn.init.zeros_(module.q_linear.bias) + nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) + nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) + elif isinstance(module, ConditionalDETRLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, ConditionalDETRDecoder): + module.gradient_checkpointing = value + + +CONDITIONAL_DETR_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`ConditionalDETRConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CONDITIONAL_DETR_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See [`ConditionalDETRFeatureExtractor.__call__`] for + details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. 
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR +class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`ConditionalDETREncoderLayer`]. + + The encoder updates the flattened feature map through multiple self-attention layers. + + Small tweak for CONDITIONAL_DETR: + + - position_embeddings are added to the forward pass. + + Args: + config: ConditionalDETRConfig + """ + + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + self.layers = nn.ModuleList([ConditionalDETREncoderLayer(config) for _ in range(config.encoder_layers)]) + + # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + # we add position_embeddings as extra input to the encoder_layer + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDETRDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some small tweaks for CONDITIONAL_DETR: + + - position_embeddings and query_position_embeddings are added to the forward pass. + - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. 
+
+    Args:
+        config: ConditionalDETRConfig
+    """
+
+    def __init__(self, config: ConditionalDETRConfig):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+
+        self.layers = nn.ModuleList([ConditionalDETRDecoderLayer(config) for _ in range(config.decoder_layers)])
+        # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output
+        self.layernorm = nn.LayerNorm(config.d_model)
+        d_model = config.d_model
+        self.gradient_checkpointing = False
+
+        # query_scale is the FFN applied to the decoder embedding f to generate the transformation T that scales the
+        # query sine embeddings
+        self.query_scale = MLP(d_model, d_model, d_model, 2)
+        self.ref_point_head = MLP(d_model, d_model, 2, 2)
+        for layer_id in range(config.decoder_layers - 1):
+            self.layers[layer_id + 1].ca_qpos_proj = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        position_embeddings=None,
+        query_position_embeddings=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`:
+
+                - 1 for queries that are **not masked**,
+                - 0 for queries that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+
+            position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Position embeddings that are added to the queries and keys in each cross-attention layer.
+            query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + input_shape = inputs_embeds.size()[:-1] + + combined_attention_mask = None + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # optional intermediate hidden states + intermediate = () if self.config.auxiliary_loss else None + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + reference_points_before_sigmoid = self.ref_point_head(query_position_embeddings) # [num_queries, batch_size, 2] + reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) + obj_center = reference_points[..., :2].transpose(0, 1) + # get sine embedding for the query vector + query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + if idx == 0: + pos_transformation = 1 + else: + pos_transformation = self.query_scale(hidden_states) + # apply transformation + query_sine_embed = query_sine_embed_before_transformation * pos_transformation + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + position_embeddings, + query_position_embeddings, + query_sine_embed, + encoder_hidden_states, + encoder_attention_mask, + None, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + query_sine_embed=query_sine_embed, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + is_first=(idx==0) + ) + + hidden_states = layer_outputs[0] + + if self.config.auxiliary_loss: + hidden_states = self.layernorm(hidden_states) + intermediate += (hidden_states,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # finally, apply layernorm + hidden_states = self.layernorm(hidden_states) + + # add hidden states from the last 
decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # stack intermediate decoder activations + if self.config.auxiliary_loss: + intermediate = torch.stack(intermediate) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate, reference_points] + if v is not None + ) + return ConditionalDETRDecoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + intermediate_hidden_states=intermediate, + reference_points=reference_points + ) + + +@add_start_docstrings( + """ + The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without + any specific head on top. + """, + CONDITIONAL_DETR_START_DOCSTRING, +) +class ConditionalDETRModel(ConditionalDETRPreTrainedModel): + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = ConditionalDETRTimmConvEncoder(config.backbone, config.dilation) + position_embeddings = build_position_encoding(config) + self.backbone = ConditionalDETRConvModel(backbone, position_embeddings) + + # Create projection layer + self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) + + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = ConditionalDETREncoder(config) + self.decoder = ConditionalDETRDecoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ConditionalDETRModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if 
pixel_mask is None:
+            pixel_mask = torch.ones((batch_size, height, width), device=device)
+
+        # First, send pixel_values + pixel_mask through the backbone to obtain the features
+        # pixel_values should be of shape (batch_size, num_channels, height, width)
+        # pixel_mask should be of shape (batch_size, height, width)
+        features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)
+
+        # get final feature map and downsampled mask
+        feature_map, mask = features[-1]
+
+        if mask is None:
+            raise ValueError("Backbone does not return downsampled pixel mask")
+
+        # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        projected_feature_map = self.input_projection(feature_map)
+
+        # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
+        # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
+        flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
+        position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1)
+
+        flattened_mask = mask.flatten(1)
+
+        # Fourth, send flattened_features + flattened_mask + position embeddings through the encoder
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                inputs_embeds=flattened_features,
+                attention_mask=flattened_mask,
+                position_embeddings=position_embeddings,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Fifth, send query embeddings + position embeddings through the decoder (which is conditioned on the encoder output)
+        query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1)
+        queries = torch.zeros_like(query_position_embeddings)
+
+        # decoder outputs consist of (dec_features, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            inputs_embeds=queries,
+            attention_mask=None,
+            position_embeddings=position_embeddings,
+            query_position_embeddings=query_position_embeddings,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=flattened_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return ConditionalDETRModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+            reference_points=decoder_outputs.reference_points,
+        )
+
+
+@add_start_docstrings(
+    """
+    CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
+    such as COCO detection.
+    """,
+    CONDITIONAL_DETR_START_DOCSTRING,
+)
+# Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR
+class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel):
+    def __init__(self, config: ConditionalDETRConfig):
+        super().__init__(config)
+
+        # CONDITIONAL_DETR encoder-decoder model
+        self.model = ConditionalDETRModel(config)
+
+        # Object detection heads
+        self.class_labels_classifier = nn.Linear(
+            config.d_model, config.num_labels
+        )  # Unlike DETR, no extra "no object" class is added here, as the classification loss is a focal loss
+        self.bbox_predictor = ConditionalDETRMLPPredictionHead(
+            input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionaries with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
+
+    @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=ConditionalDETRObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values,
+        pixel_mask=None,
+        decoder_attention_mask=None,
+        encoder_outputs=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`List[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
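+
+            For example, for an image with a single annotated object, one could pass (illustrative values only):
+            `[{"class_labels": torch.tensor([0]), "boxes": torch.tensor([[0.5, 0.5, 0.2, 0.2]])}]`, with the boxes in
+            normalized `(center_x, center_y, width, height)` format.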
+ + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # model predicts bounding boxes and corresponding COCO classes + >>> logits = outputs.logits + >>> bboxes = outputs.pred_boxes + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through CONDITIONAL_DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # class logits + predicted bounding boxes + logits = self.class_labels_classifier(sequence_output) + + reference = outputs.reference_points if return_dict else outputs[-1] + reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) + outputs_coords = [] + hs = sequence_output + tmp = self.bbox_predictor(hs) + tmp[..., :2] += reference_before_sigmoid + pred_boxes = tmp.sigmoid() + # pred_boxes = self.bbox_predictor(sequence_output).sigmoid() + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = ConditionalDETRHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = ConditionalDETRLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] + outputs_class = self.class_labels_classifier(intermediate) + + for lvl in range(hs.shape[0]): + tmp = self.bbox_predictor(hs[lvl]) + tmp[..., :2] += reference_before_sigmoid + outputs_coord = tmp.sigmoid() + outputs_coords.append(outputs_coord) + outputs_coord = torch.stack(outputs_coords) + + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": self.config.cls_loss_coefficient, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] 
for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return ConditionalDETRObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks + such as COCO panoptic. + + """, + CONDITIONAL_DETR_START_DOCSTRING, +) +# Copied from transformers.models.detr.modeling_detr.DetrForSegmentation with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR +class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + # object detection model + self.conditional_detr = ConditionalDETRForObjectDetection(config) + + # segmentation head + hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads + intermediate_channel_sizes = self.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes + + self.mask_head = ConditionalDETRMaskHeadSmallConv( + hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size + ) + + self.bbox_attention = ConditionalDETRMHAttentionMap( + hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ConditionalDETRSegmentationOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each + dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels, + bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves + should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`. 
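Both the detection head above and the segmentation head below aggregate the criterion's output the same way: a plain weighted sum over whichever keys appear in the weight dict, with each auxiliary decoder layer contributing copies of the keys suffixed `_{i}`. A small self-contained sketch with made-up numbers (not taken from the patch):

```python
# Made-up loss values: the criterion also returns entries (e.g. cardinality_error)
# that have no weight and therefore do not contribute to the total.
loss_dict = {
    "loss_ce": 0.8, "loss_bbox": 0.5, "loss_giou": 0.4,
    "cardinality_error": 2.0,
    "loss_ce_0": 0.9, "loss_bbox_0": 0.6, "loss_giou_0": 0.5,  # auxiliary layer 0
}

weight_dict = {"loss_ce": 2.0, "loss_bbox": 5.0, "loss_giou": 2.0}  # illustrative coefficients
num_aux_layers = 1  # decoder_layers - 1 in the model
aux_weight_dict = {}
for i in range(num_aux_layers):
    aux_weight_dict.update({f"{k}_{i}": v for k, v in weight_dict.items()})
weight_dict.update(aux_weight_dict)

total = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)
print(total)  # 10.7 with the numbers above
```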
+ + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + >>> model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # model predicts COCO classes, bounding boxes, and masks + >>> logits = outputs.logits + >>> bboxes = outputs.pred_boxes + >>> masks = outputs.pred_masks + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones((batch_size, height, width), device=device) + + # First, get list of feature maps and position embeddings + features, position_embeddings_list = self.conditional_detr.model.backbone(pixel_values, pixel_mask=pixel_mask) + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + feature_map, mask = features[-1] + batch_size, num_channels, height, width = feature_map.shape + projected_feature_map = self.conditional_detr.model.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.conditional_detr.model.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.conditional_detr.model.query_position_embeddings.weight.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.conditional_detr.model.decoder( + inputs_embeds=queries, + attention_mask=None, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + 
encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Sixth, compute logits, pred_boxes and pred_masks + logits = self.conditional_detr.class_labels_classifier(sequence_output) + pred_boxes = self.conditional_detr.bbox_predictor(sequence_output).sigmoid() + + memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width) + mask = flattened_mask.view(batch_size, height, width) + + # FIXME h_boxes takes the last one computed, keep this in mind + # important: we need to reverse the mask, since in the original implementation the mask works reversed + # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32) + bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask) + + seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) + + pred_masks = seg_masks.view(batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = ConditionalDETRHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality", "masks"] + criterion = ConditionalDETRLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + outputs_loss["pred_masks"] = pred_masks + if self.config.auxiliary_loss: + intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1] + outputs_class = self.class_labels_classifier(intermediate) + outputs_coord = self.bbox_predictor(intermediate).sigmoid() + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + weight_dict["loss_mask"] = self.config.mask_loss_coefficient + weight_dict["loss_dice"] = self.config.dice_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs + else: + output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return ConditionalDETRSegmentationOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + pred_masks=pred_masks, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=decoder_outputs.last_hidden_state, + 
decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def _expand(tensor, length: int): + return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py +# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDETR +class ConditionalDETRMaskHeadSmallConv(nn.Module): + """ + Simple convolutional head, using group norm. Upsampling is done using a FPN approach + """ + + def __init__(self, dim, fpn_dims, context_dim): + super().__init__() + + if dim % 8 != 0: + raise ValueError( + "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8" + ) + + inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + + self.lay1 = nn.Conv2d(dim, dim, 3, padding=1) + self.gn1 = nn.GroupNorm(8, dim) + self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1) + self.gn2 = nn.GroupNorm(8, inter_dims[1]) + self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = nn.GroupNorm(8, inter_dims[2]) + self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = nn.GroupNorm(8, inter_dims[3]) + self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = nn.GroupNorm(8, inter_dims[4]) + self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1) + + self.dim = dim + + self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): + # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with + # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32). + # We expand the projected feature map to match the number of heads. 
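To make the tensor bookkeeping in the mask head concrete, here is a small shape-only sketch (with made-up sizes) of the expansion and concatenation performed at the start of its forward pass; it mirrors the `_expand` helper defined above.

```python
import torch

def _expand(tensor, length):
    # repeat each batch element `length` times along a new dim, then fold it into the batch dim
    return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)

batch_size, d_model, n_queries, n_heads, h, w = 2, 256, 12, 8, 25, 34  # illustrative sizes
x = torch.rand(batch_size, d_model, h, w)                     # projected feature map
bbox_mask = torch.rand(batch_size, n_queries, n_heads, h, w)  # attention maps

fused = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], dim=1)
print(fused.shape)  # torch.Size([24, 264, 25, 34]) -> (batch_size * n_queries, d_model + n_heads, h, w)
```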
+ x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) + + x = self.lay1(x) + x = self.gn1(x) + x = nn.functional.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter1(fpns[0]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay3(x) + x = self.gn3(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter2(fpns[1]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay4(x) + x = self.gn4(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter3(fpns[2]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay5(x) + x = self.gn5(x) + x = nn.functional.relu(x) + + x = self.out_lay(x) + return x + + +# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDETR +class ConditionalDETRMHAttentionMap(nn.Module): + """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + + self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 + + def forward(self, q, k, mask: Optional[Tensor] = None): + q = self.q_linear(q) + k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) + keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) + + if mask is not None: + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = self.dropout(weights) + return weights + + +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. 
Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py +class ConditionalDETRLoss(nn.Module): + """ + This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process happens in two steps: 1) + we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair + of matched ground-truth / prediction (supervise class and box). + + + + Args: + matcher (`ConditionalDETRHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parmeter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor of dim + [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + src_logits = outputs["logits"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2]+1], + dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:,:,:-1] + loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1] + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. 
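A minimal sketch (made-up sizes, not from the patch) of the one-hot trick used in `loss_labels` above: targets get an extra "no-object" slot, which is then dropped so that unmatched queries become all-zero rows for the sigmoid focal loss.

```python
import torch

# Illustrative sizes: 2 images, 4 queries, 3 real classes.
batch_size, num_queries, num_classes = 2, 4, 3
src_logits = torch.randn(batch_size, num_queries, num_classes)

# target_classes holds a class id per query; unmatched queries get the
# "no-object" id, which equals num_classes.
target_classes = torch.full((batch_size, num_queries), num_classes, dtype=torch.int64)
target_classes[0, 1] = 2  # pretend the matcher assigned query 1 of image 0 to class 2
target_classes[1, 0] = 0

# One-hot with an extra slot for "no-object", then drop that slot so unmatched
# queries end up as all-zero rows (pure negatives for the focal loss).
onehot = torch.zeros(batch_size, num_queries, num_classes + 1, dtype=src_logits.dtype)
onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
onehot = onehot[:, :, :-1]
print(onehot[0])
```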
+ """ + logits = outputs["logits"] + device = logits.device + tgt_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(src_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the masks: the focal loss and the dice loss. + + Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. + """ + if "pred_masks" not in outputs: + raise KeyError("No predicted masks found in outputs") + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = nn.functional.interpolate( + src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False + ) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + "masks": self.loss_masks, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + 
def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDETR,detr->conditional_detr +class ConditionalDETRMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py +class ConditionalDETRHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. 
+ giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 or bbox_cost == 0 or giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. + + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["class_labels"] for v in targets]) + tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(tgt_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py + + +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. 
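As a small aside on the matcher's final step above: the flattened cost matrix is split back per image before running the Hungarian algorithm on each image's block. The sketch below uses a made-up random cost matrix and target counts.

```python
import torch
from scipy.optimize import linear_sum_assignment

# Made-up example: batch of 2 images with 3 queries each; image 0 has 2 target
# boxes and image 1 has 1, so the flattened cost matrix has 3 columns in total.
batch_size, num_queries = 2, 3
cost_matrix = torch.rand(batch_size, num_queries, 3)

sizes = [2, 1]  # number of target boxes per image
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
# indices[0] pairs queries of image 0 with its 2 targets, indices[1] does the same for image 1
print([(i.tolist(), j.tolist()) for i, j in indices])
```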
+ + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# modified from torchvision to also return the union +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. + + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / area + + +# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306 + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index e1f4f3b1fd9fa..f61b8dfdda282 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -24,6 +24,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class ConditionalDETRFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + 
requires_backends(self, ["vision"]) + + class ConvNextFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/conditional_detr/__init__.py b/tests/models/conditional_detr/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py new file mode 100644 index 0000000000000..b5e19fe005da7 --- /dev/null +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -0,0 +1,338 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_vision_available + +from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDETRFeatureExtractor + + +class ConditionalDETRFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "max_size": self.max_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to ConditionalDETRFeatureExtractor, + assuming do_resize is set to True with a scalar size. 
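A tiny sketch (not part of the test) of the shorter-side resize rule that `get_expected_values` encodes below, ignoring `max_size` since the tester deliberately sets it high enough not to trigger:

```python
# The shorter image side is scaled to `size`; the longer side keeps the aspect ratio.
def expected_hw(height, width, size=18):
    if width < height:
        return int(size * height / width), size
    if width > height:
        return size, int(size * width / height)
    return size, size

print(expected_hw(400, 300))  # (24, 18): the shorter side becomes `size`
```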
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size * h / w) + expected_width = self.size + elif w > h: + expected_height = self.size + expected_width = int(self.size * w / h) + else: + expected_height = self.size + expected_width = self.size + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + +@require_torch +@require_vision +class ConditionalDETRFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = ConditionalDETRFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = ConditionalDETRFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "max_size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = 
feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_equivalence_pad_and_create_pixel_mask(self): + # Initialize feature_extractors + feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict) + feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test whether the method "pad_and_return_pixel_mask" and calling the feature extractor return the same tensors + encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") + encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") + + assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # encode them + feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + assert torch.allclose(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = 
torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["labels"][0]["size"], expected_size) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # encode them + # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic + feature_extractor = ConditionalDETRFeatureExtractor(format="coco_panoptic") + encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + assert torch.allclose(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify masks + expected_masks_sum = 822338 + self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["labels"][0]["size"], expected_size) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py 
b/tests/models/conditional_detr/test_modeling_conditional_detr.py new file mode 100644 index 0000000000000..38933f26c6627 --- /dev/null +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -0,0 +1,538 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch CONDITIONAL_DETR model. """ + + +import inspect +import math +import unittest + +from transformers import ConditionalDETRConfig, is_timm_available, is_vision_available +from transformers.testing_utils import require_timm, require_vision, slow, torch_device +from transformers.utils import cached_property + +from ...generation.test_generation_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor + + +if is_timm_available(): + import torch + + from transformers import ConditionalDETRForObjectDetection, ConditionalDETRForSegmentation, ConditionalDETRModel + + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDETRFeatureExtractor + + +class ConditionalDETRModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=256, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + min_size=200, + max_size=200, + n_targets=8, + num_labels=91, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.min_size = min_size + self.max_size = max_size + self.n_targets = n_targets + self.num_labels = num_labels + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) + + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + 
target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + return ConditionalDETRConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_conditional_detr_model(self, config, pixel_values, pixel_mask, labels): + model = ConditionalDETRModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) + ) + + def create_and_check_conditional_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = ConditionalDETRForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_timm +class ConditionalDETRModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + ConditionalDETRModel, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ) + if is_timm_available() + else () + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ in ["ConditionalDETRForObjectDetection", "ConditionalDETRForSegmentation"]: + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = 
ConditionalDETRModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConditionalDETRConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_conditional_detr_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_conditional_detr_model(*config_and_inputs) + + def test_conditional_detr_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_conditional_detr_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="CONDITIONAL_DETR does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @slow + def test_model_outputs_equivalence(self): + # TODO Niels: fix me! + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + decoder_seq_length = self.model_tester.decoder_seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + decoder_key_length = self.model_tester.decoder_seq_length + encoder_key_length = self.model_tester.encoder_seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 6 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "ConditionalDETRForObjectDetection": + correct_outlen += 1 + # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks + if model_class.__name__ == "ConditionalDETRForSegmentation": + correct_outlen += 2 + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), 
self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, 
inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "ConditionalDETRForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + configs_no_init.init_xavier_std = 1e9 + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "bbox_attention" in name and "bias" not in name: + self.assertLess( + 100000, + abs(param.data.max().item()), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class ConditionalDETRModelIntegrationTests(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + if is_vision_available() + else None + ) + + def test_inference_no_head(self): + model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape = torch.Size((1, 300, 256)) + assert outputs.last_hidden_state.shape == expected_shape + expected_slice = torch.tensor( + [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + def test_inference_object_detection_head(self): + model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, 
model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + def test_inference_panoptic_segmentation_head(self): + model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-18.1565, -1.7568, -13.5029], [-16.8888, -1.4138, -14.1028], [-17.5709, -2.5080, -11.8654]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.5344, 0.1789, 0.9285], [0.4420, 0.0572, 0.0875], [0.6630, 0.6887, 0.1017]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + expected_shape_masks = torch.Size((1, model.config.num_queries, 200, 267)) + self.assertEqual(outputs.pred_masks.shape, expected_shape_masks) + expected_slice_masks = torch.tensor( + [[-7.7558, -10.8788, -11.9797], [-11.8881, -16.4329, -17.7451], [-14.7316, -19.7383, -20.3004]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, atol=1e-3)) From f2a47b48de1ea1dfb7cc530dcaf728cfcc24c2ff Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:54:36 -0400 Subject: [PATCH 044/233] checked copies --- docs/source/en/index.mdx | 2 +- .../modeling_conditional_detr.py | 63 ++++++++++++------- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 7830c57e6eabd..84d489e22ddf1 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,7 +68,7 @@ The documentation is organized into five sections: 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. 
**[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . +1. **[conditional_detr](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 1f63d22be27a5..4ce1f8f3d5776 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -153,8 +153,8 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding - boxes. + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and @@ -217,12 +217,13 @@ class ConditionalDETRSegmentationOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding - boxes. + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + unnormalized bounding boxes. pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): - Segmentation masks logits for all queries. See also [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or - [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks - respectively. 
+ Segmentation masks logits for all queries. See also + [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation + masks respectively. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and @@ -326,7 +327,6 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) -# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder with Detr->ConditionalDETR class ConditionalDETRTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. @@ -1581,7 +1581,6 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -# Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel): def __init__(self, config: ConditionalDETRConfig): super().__init__(config) @@ -1670,7 +1669,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] @@ -1755,7 +1754,6 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -# Copied from transformers.models.detr.modeling_detr.DetrForSegmentation with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): def __init__(self, config: ConditionalDETRConfig): super().__init__(config) @@ -1807,22 +1805,40 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation - >>> from PIL import Image + >>> import io >>> import requests + >>> from PIL import Image + >>> import torch + >>> import numpy + + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from transformers.models.conditional_detr.feature_extraction_conditional_detr import rgb_to_id >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") - >>> model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained( + ... "facebook/conditional_detr-resnet-50-panoptic" + ... 
) + >>> model = ConditionalDETRForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") + >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") + + >>> # forward pass >>> outputs = model(**inputs) - >>> # model predicts COCO classes, bounding boxes, and masks - >>> logits = outputs.logits - >>> bboxes = outputs.pred_boxes - >>> masks = outputs.pred_masks + + >>> # use the `post_process_panoptic` method of `ConditionalDETRFeatureExtractor` to convert to COCO format + >>> processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0) + >>> result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0] + + >>> # the segmentation is stored in a special-format png + >>> panoptic_seg = Image.open(io.BytesIO(result["png_string"])) + >>> panoptic_seg = numpy.array(panoptic_seg, dtype=numpy.uint8) + >>> # retrieve the ids corresponding to each mask + >>> panoptic_seg_id = rgb_to_id(panoptic_seg) + >>> panoptic_seg_id.shape + (800, 1066) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1903,7 +1919,9 @@ def forward( seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) - pred_masks = seg_masks.view(batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + pred_masks = seg_masks.view( + batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1] + ) loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: @@ -1985,7 +2003,8 @@ def __init__(self, dim, fpn_dims, context_dim): if dim % 8 != 0: raise ValueError( - "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8" + "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in" + " GroupNorm is set to 8" ) inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] @@ -2077,7 +2096,7 @@ def forward(self, q, k, mask: Optional[Tensor] = None): weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) if mask is not None: - weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights From f1a66735030dc79a8f96fbe8a54a9c7efa3f68ce Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:55:14 -0400 Subject: [PATCH 045/233] checked copies --- .../models/conditional_detr/modeling_conditional_detr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 4ce1f8f3d5776..1473df759cd8e 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2347,7 +2347,6 @@ def forward(self, outputs, targets): # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDETR,detr->conditional_detr class ConditionalDETRMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), 
used to predict the normalized center coordinates, From 533f349f44ec651aeab90f12467bb75b9f7356d0 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 01:22:01 -0400 Subject: [PATCH 046/233] fixed style and copies --- docs/source/en/_toctree.yml | 8 +- .../configuration_conditional_detr.py | 8 - ..._original_pytorch_checkpoint_to_pytorch.py | 54 +++++-- .../feature_extraction_conditional_detr.py | 4 +- .../modeling_conditional_detr.py | 144 +++++++++++------- 5 files changed, 133 insertions(+), 85 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index c21388b60a6f9..f6d6a94eee334 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,7 +42,8 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - sections: + - isExpanded: false + sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +59,6 @@ - local: tasks/multiple_choice title: Multiple choice title: Task guides - isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification @@ -362,6 +362,8 @@ sections: - local: model_doc/beit title: BEiT + - local: model_doc/conditional_detr + title: ConditionalDETR - local: model_doc/convnext title: ConvNeXT - local: model_doc/cvt @@ -501,4 +503,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API \ No newline at end of file + title: API diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index ce004dc6fc15c..bfb759ca246ca 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -211,14 +211,6 @@ def num_attention_heads(self) -> int: def hidden_size(self) -> int: return self.d_model - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model - class ConditionalDETROnnxConfig(OnnxConfig): diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index 2d7abc3c088b5..4b56d729d1f35 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -94,29 +94,55 @@ rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) # q, k, v projections in self/cross-attention in decoder for conditional DETR - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight") + ) + 
rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight") + ) rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight") + ) # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight") + ) rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight") + ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias") + ) rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias") + ) # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias") + ) rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) - 
rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias") + ) # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads # for conditional DETR, also convert reference point head and query scale MLP @@ -144,7 +170,7 @@ ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), - ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias") + ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias"), ] ) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 07550124c157e..f37e3e1fb69bf 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -704,8 +704,8 @@ def post_process(self, outputs, target_sizes): topk_boxes = topk_indexes // out_logits.shape[2] labels = topk_indexes % out_logits.shape[2] boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4)) - + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + # and from relative [0, 1] to absolute [0, height] coordinates img_h, img_w = target_sizes.unbind(1) scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 1473df759cd8e..54dea544f3da9 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -62,13 +62,13 @@ ] - @dataclass class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): """ - Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, - namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them - gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + Base class for outputs of the Conditional DETR decoder. This class adds one attribute to + BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output + of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary + decoding losses. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -97,9 +97,10 @@ class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): @dataclass class ConditionalDETRModelOutput(Seq2SeqModelOutput): """ - Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, - namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them - gone through a layernorm. 
This is useful when training the model with auxiliary decoding losses. + Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to + Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder + layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding + losses. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -389,6 +390,7 @@ def forward(self, pixel_values, pixel_mask): return out, pos + # Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ @@ -479,6 +481,7 @@ def build_position_encoding(config): return position_embedding + # function to generate sine positional embedding for 2d coordinates def gen_sineembed_for_position(pos_tensor): scale = 2 * math.pi @@ -493,11 +496,13 @@ def gen_sineembed_for_position(pos_tensor): pos = torch.cat((pos_y, pos_x), dim=2) return pos + def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) - return torch.log(x1/x2) + return torch.log(x1 / x2) + class DetrAttention(nn.Module): """ @@ -521,7 +526,8 @@ def __init__( self.head_dim = embed_dim // num_heads if self.head_dim * num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." ) self.scaling = self.head_dim**-0.5 @@ -585,7 +591,8 @@ def forward( if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" ) if attention_mask is not None: @@ -614,7 +621,8 @@ def forward( if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) @@ -625,12 +633,13 @@ def forward( return attn_output, attn_weights_reshaped + class ConditionalDETRAttention(nn.Module): """ Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. - The key q_proj, k_proj, v_proj are defined outside the attention. - This attention allows the dim of q, k to be different to v. + The key q_proj, k_proj, v_proj are defined outside the attention. This attention allows the dim of q, k to be + different to v. """ def __init__( @@ -649,7 +658,8 @@ def __init__( self.head_dim = embed_dim // num_heads if self.head_dim * num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." 
) # head dimension of values self.v_head_dim = out_dim // num_heads @@ -663,6 +673,7 @@ def __init__( def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() @@ -696,7 +707,8 @@ def forward( if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" ) if attention_mask is not None: @@ -725,7 +737,8 @@ def forward( if attn_output.size() != (bsz * self.num_heads, tgt_len, self.v_head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is {attn_output.size()}" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is" + f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.v_head_dim) @@ -818,19 +831,18 @@ def __init__(self, config: ConditionalDETRConfig): self.sa_kcontent_proj = nn.Linear(d_model, d_model) self.sa_kpos_proj = nn.Linear(d_model, d_model) self.sa_v_proj = nn.Linear(d_model, d_model) - + self.self_attn = ConditionalDETRAttention( embed_dim=self.embed_dim, out_dim=self.embed_dim, num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout + dropout=config.attention_dropout, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - # Decoder Cross-Attention projections self.ca_qcontent_proj = nn.Linear(d_model, d_model) @@ -841,10 +853,7 @@ def __init__(self, config: ConditionalDETRConfig): self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) self.encoder_attn = ConditionalDETRAttention( - self.embed_dim*2, - self.embed_dim, - config.decoder_attention_heads, - dropout=config.attention_dropout + self.embed_dim * 2, self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) @@ -862,7 +871,7 @@ def forward( encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, - is_first: Optional[bool] = False + is_first: Optional[bool] = False, ): """ Args: @@ -888,7 +897,9 @@ def forward( # ========== Begin of Self-Attention ============= # Apply projections here # shape: num_queries x batch_size x 256 - q_content = self.sa_qcontent_proj(hidden_states) # target is the input of the first decoder layer. zero by default. + q_content = self.sa_qcontent_proj( + hidden_states + ) # target is the input of the first decoder layer. zero by default. 
q_pos = self.sa_qpos_proj(query_position_embeddings) k_content = self.sa_kcontent_proj(hidden_states) k_pos = self.sa_kpos_proj(query_position_embeddings) @@ -924,7 +935,7 @@ def forward( k_pos = self.ca_kpos_proj(position_embeddings) - # For the first decoder layer, we concatenate the positional embedding predicted from + # For the first decoder layer, we concatenate the positional embedding predicted from # the object query (the positional embedding) into the original query (key) in DETR. if is_first: q_pos = self.ca_qpos_proj(query_position_embeddings) @@ -934,12 +945,12 @@ def forward( q = q_content k = k_content - q = q.view(bs, num_queries, self.nhead, n_model//self.nhead) + q = q.view(bs, num_queries, self.nhead, n_model // self.nhead) query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) - query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model//self.nhead) + query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model // self.nhead) q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) - k = k.view(bs, hw, self.nhead, n_model//self.nhead) - k_pos = k_pos.view(bs, hw, self.nhead, n_model//self.nhead) + k = k.view(bs, hw, self.nhead, n_model // self.nhead) + k_pos = k_pos.view(bs, hw, self.nhead, n_model // self.nhead) k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) # Cross-Attention Block @@ -996,8 +1007,9 @@ def forward(self, hidden_states: torch.Tensor): hidden_states = self.out_proj(hidden_states) return hidden_states + class MLP(nn.Module): - """ Very simple multi-layer perceptron (also called FFN)""" + """Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() @@ -1066,8 +1078,8 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See [`ConditionalDETRFeatureExtractor.__call__`] for - details. + Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See + [`ConditionalDETRFeatureExtractor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Mask to avoid performing attention on padding pixel values. 
Mask values selected in `[0, 1]`: @@ -1323,11 +1335,13 @@ def forward( all_self_attns = () if output_attentions else None all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - reference_points_before_sigmoid = self.ref_point_head(query_position_embeddings) # [num_queries, batch_size, 2] + reference_points_before_sigmoid = self.ref_point_head( + query_position_embeddings + ) # [num_queries, batch_size, 2] reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) - obj_center = reference_points[..., :2].transpose(0, 1) + obj_center = reference_points[..., :2].transpose(0, 1) # get sine embedding for the query vector - query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1339,7 +1353,7 @@ def forward( if idx == 0: pos_transformation = 1 else: - pos_transformation = self.query_scale(hidden_states) + pos_transformation = self.query_scale(hidden_states) # apply transformation query_sine_embed = query_sine_embed_before_transformation * pos_transformation if self.gradient_checkpointing and self.training: @@ -1372,7 +1386,7 @@ def custom_forward(*inputs): encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, - is_first=(idx==0) + is_first=(idx == 0), ) hidden_states = layer_outputs[0] @@ -1401,7 +1415,14 @@ def custom_forward(*inputs): if not return_dict: return tuple( v - for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate, reference_points] + for v in [ + hidden_states, + all_hidden_states, + all_self_attns, + all_cross_attentions, + intermediate, + reference_points, + ] if v is not None ) return ConditionalDETRDecoderOutput( @@ -1410,14 +1431,14 @@ def custom_forward(*inputs): attentions=all_self_attns, cross_attentions=all_cross_attentions, intermediate_hidden_states=intermediate, - reference_points=reference_points + reference_points=reference_points, ) @add_start_docstrings( """ - The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without - any specific head on top. + The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. """, CONDITIONAL_DETR_START_DOCSTRING, ) @@ -1570,14 +1591,14 @@ def forward( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, - reference_points=decoder_outputs.reference_points + reference_points=decoder_outputs.reference_points, ) @add_start_docstrings( """ - CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks - such as COCO detection. + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. 
""", CONDITIONAL_DETR_START_DOCSTRING, ) @@ -1669,7 +1690,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] @@ -1748,8 +1769,8 @@ def forward( @add_start_docstrings( """ - CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks - such as COCO panoptic. + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, + for tasks such as COCO panoptic. """, CONDITIONAL_DETR_START_DOCSTRING, @@ -2154,9 +2175,9 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py class ConditionalDETRLoss(nn.Module): """ - This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process happens in two steps: 1) - we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair - of matched ground-truth / prediction (supervise class and box). + This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process + happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) + we supervise each pair of matched ground-truth / prediction (supervise class and box). @@ -2181,8 +2202,8 @@ def __init__(self, matcher, num_classes, focal_alpha, losses): # removed logging parameter, which was part of the original implementation def loss_labels(self, outputs, targets, indices, num_boxes): """ - Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor of dim - [nb_target_boxes] + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] """ if "logits" not in outputs: raise KeyError("No logits were found in the outputs") @@ -2195,12 +2216,19 @@ def loss_labels(self, outputs, targets, indices, num_boxes): ) target_classes[idx] = target_classes_o - target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2]+1], - dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) + target_classes_onehot = torch.zeros( + [src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1], + dtype=src_logits.dtype, + layout=src_logits.layout, + device=src_logits.device, + ) target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) - target_classes_onehot = target_classes_onehot[:,:,:-1] - loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1] + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * src_logits.shape[1] + ) losses = {"loss_ce": loss_ce} return losses @@ -2430,7 +2458,7 @@ def forward(self, outputs, targets): # Compute the classification cost. 
alpha = 0.25 gamma = 2.0 - neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) class_cost = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] From 1c69c399643772d3e3fb7297972d9be52fe922c2 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:14:09 -0400 Subject: [PATCH 047/233] fixed style and copies --- .../conditional_detr/configuration_conditional_detr.py | 8 ++++---- ...itional_detr_original_pytorch_checkpoint_to_pytorch.py | 1 + utils/check_repo.py | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index bfb759ca246ca..844260f0d12f5 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,7 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DeppMeng/ConditionalDETR": "https://huggingface.co/DeppMeng/ConditionalDETR/resolve/main/config.json", + "DepuMeng/ConditionalDETR": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", } @@ -36,7 +36,7 @@ class ConditionalDETRConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DeppMeng/ConditionalDETR](https://huggingface.co/DeppMeng/ConditionalDETR) architecture. + [DepuMeng/ConditionalDETR](https://huggingface.co/DepuMeng/ConditionalDETR) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
@@ -114,10 +114,10 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DeppMeng/ConditionalDETR style configuration + >>> # Initializing a CONDITIONAL_DETR DepuMeng/ConditionalDETR style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DeppMeng/ConditionalDETR style configuration + >>> # Initializing a model from the DepuMeng/ConditionalDETR style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index 4b56d729d1f35..f3c6fcadb2e2d 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -293,6 +293,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): model = ConditionalDETRForSegmentation(config) if is_panoptic else ConditionalDETRForObjectDetection(config) model.load_state_dict(state_dict) model.eval() + model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") # verify our conversion original_outputs = conditional_detr(pixel_values) outputs = model(pixel_values) diff --git a/utils/check_repo.py b/utils/check_repo.py index ea3b997f1f126..ffc47aa72af8f 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -60,6 +60,8 @@ "DetrEncoder", # Building part of bigger (tested) model. "DetrDecoder", # Building part of bigger (tested) model. "DetrDecoderWrapper", # Building part of bigger (tested) model. + "ConditionalDETREncoder", # Building part of bigger (tested) model. + "ConditionalDETRDecoder", # Building part of bigger (tested) model. "M2M100Encoder", # Building part of bigger (tested) model. "M2M100Decoder", # Building part of bigger (tested) model. "MCTCTEncoder", # Building part of bigger (tested) model. From 5c56728ecd32b913d7496c0d7aa916722ce79551 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:18:56 -0400 Subject: [PATCH 048/233] fixed hub --- .../conditional_detr/configuration_conditional_detr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 844260f0d12f5..3c05d959879ec 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,7 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/ConditionalDETR": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", + "DepuMeng/conditional_detr_resnet50": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", } @@ -36,7 +36,7 @@ class ConditionalDETRConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DepuMeng/ConditionalDETR](https://huggingface.co/DepuMeng/ConditionalDETR) architecture. + [DepuMeng/conditional_detr_resnet50](https://huggingface.co/DepuMeng/conditional_detr_resnet50) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -114,10 +114,10 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DepuMeng/ConditionalDETR style configuration + >>> # Initializing a CONDITIONAL_DETR DepuMeng/conditional_detr_resnet50 style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DepuMeng/ConditionalDETR style configuration + >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration From 8a2c20b08a80d2f1930e9b584e3b6c224e929cee Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:27:32 -0400 Subject: [PATCH 049/233] fixed style --- .../models/conditional_detr/configuration_conditional_detr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 3c05d959879ec..4291a6ecc4a94 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,9 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/conditional_detr_resnet50": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", + "DepuMeng/conditional_detr_resnet50": ( + "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json" + ), } From 38a9af8ab97b109f1f66d601f5bb1133e47b5ed5 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:08 -0400 Subject: [PATCH 050/233] Update README.md Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 986631466858f..fcdae9122142b 100644 --- a/README.md +++ b/README.md @@ -276,7 +276,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From 7572fced2c95170f562dbf004552daee333485a5 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:17 -0400 Subject: [PATCH 051/233] Update docs/source/en/_toctree.yml Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f6d6a94eee334..d158be23cea58 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -363,7 +363,7 @@ - local: model_doc/beit title: BEiT - local: model_doc/conditional_detr - title: ConditionalDETR + title: Conditional DETR - local: model_doc/convnext title: ConvNeXT - local: model_doc/cvt From f9b46b01e570958bf239855169e18610d7c4c9e4 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:58 -0400 Subject: [PATCH 052/233] Update docs/source/en/index.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 84d489e22ddf1..4d3aa7711503d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,7 +68,7 @@ The documentation is organized into five sections: 1. 
**[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From f4cfec688822c7c2ddd5e8503c6e03b08a8ff05a Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:34:10 -0400 Subject: [PATCH 053/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 4291a6ecc4a94..722608a7c945e 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 3428cb33df952a45efca15648be2921ed1eb8fc2 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:34:32 -0400 Subject: [PATCH 054/233] Update src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- ...t_conditional_detr_original_pytorch_checkpoint_to_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index f3c6fcadb2e2d..bcb2bbdda9a7d 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert CONDITIONAL_DETR checkpoints.""" +"""Convert Conditional DETR checkpoints.""" import argparse From febe49aa65f7d897bd8d697e52310fb77bdd6191 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:37:15 -0400 Subject: [PATCH 055/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 722608a7c945e..278ed04c14a35 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" CONDITIONAL_DETR model configuration""" +""" Conditional DETR model configuration""" from collections import OrderedDict from typing import Mapping From 2ba7703e41e602b178db24342d26c4f975b9c908 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 10:48:44 -0400 Subject: [PATCH 056/233] fixed some doc issue --- docs/source/en/_toctree.yml | 6 +++--- src/transformers/models/auto/feature_extraction_auto.py | 2 -- .../conditional_detr/configuration_conditional_detr.py | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d158be23cea58..dfbdd503e6005 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,8 +42,7 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - isExpanded: false - sections: + - sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +57,8 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides + isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 9ba8295b406da..015fd132ef0dc 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,8 +39,6 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 278ed04c14a35..2d958a0fbe0f5 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,8 +27,8 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/conditional_detr_resnet50": ( - "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json" + "microsoft/conditional_detr_resnet50": ( + "https://huggingface.co/microsoft/conditional_detr_resnet50/resolve/main/config.json" ), } From fb4280f50b1b8f71d1ea3d4f434540a0121ed82d Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:36 -0400 Subject: [PATCH 057/233] Update docs/source/en/model_doc/conditional_detr.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/model_doc/conditional_detr.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx index 51316a24e43ee..53129b77ea05a 100644 --- a/docs/source/en/model_doc/conditional_detr.mdx +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. ## Overview -The conditional DETR model was proposed in [ConditionalDETR](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 
Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. +The Conditional DETR model was proposed in [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. The abstract from the paper is the following: From 81af3def362c7c62ba89db3788472651057deedb Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:42 -0400 Subject: [PATCH 058/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/configuration_conditional_detr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 2d958a0fbe0f5..e2a005215d0e0 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -36,9 +36,9 @@ class ConditionalDETRConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate - a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DepuMeng/conditional_detr_resnet50](https://huggingface.co/DepuMeng/conditional_detr_resnet50) architecture. + a Conditional DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Conditional DETR + [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
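The configuration hunks above describe the intended usage pattern: instantiate a config (optionally overriding documented arguments such as `num_queries`), then build a randomly initialized model from it. A minimal sketch, written with the `ConditionalDetr*` class names that this patch series settles on a few patches later; `num_queries` is one of the arguments documented in this same docstring:

```python
from transformers import ConditionalDetrConfig, ConditionalDetrModel

# Default configuration, matching the microsoft/conditional-detr-resnet-50 architecture
configuration = ConditionalDetrConfig()

# Documented arguments can be overridden before building a model,
# e.g. fewer object queries than the COCO default of 100
small_configuration = ConditionalDetrConfig(num_queries=50)
model = ConditionalDetrModel(small_configuration)

# The configuration is always recoverable from an instantiated model
assert model.config.num_queries == 50
```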
From c2f9e6fdccbfa5ef02d18d7f11e38838dc171f8e Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:54 -0400 Subject: [PATCH 059/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index e2a005215d0e0..d6013fa7e68ea 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -116,7 +116,7 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DepuMeng/conditional_detr_resnet50 style configuration + >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration >>> configuration = ConditionalDETRConfig() >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration From 4d1c2308fab420b3edb80fd3e306f4574e6bceb4 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:59 -0400 Subject: [PATCH 060/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index d6013fa7e68ea..4e0b4674356d9 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -119,7 +119,7 @@ class ConditionalDETRConfig(PretrainedConfig): >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration + >>> # Initializing a model from the microsoft/conditional-detr-resnet-50 style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration From 82000ae4d0ed6e94802b2b31afeccd11d1aa9b13 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:19:56 -0400 Subject: [PATCH 061/233] changed prefix to ConditionalDetr --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 3 +- README_zh-hant.md | 3 +- docs/source/en/_toctree.yml | 4 +- docs/source/en/index.mdx | 1 + docs/source/en/model_doc/conditional_detr.mdx | 20 +- src/transformers/__init__.py | 24 +- .../models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 4 +- .../models/conditional_detr/__init__.py | 28 +-- .../configuration_conditional_detr.py | 14 +- ..._original_pytorch_checkpoint_to_pytorch.py | 14 +- .../feature_extraction_conditional_detr.py | 26 +-- .../modeling_conditional_detr.py | 212 +++++++++--------- .../utils/dummy_vision_objects.py | 2 +- ...est_feature_extraction_conditional_detr.py | 16 +- .../test_modeling_conditional_detr.py | 44 ++-- utils/check_repo.py | 4 +- 19 files changed, 214 insertions(+), 209 deletions(-) 
diff --git a/README.md b/README.md index fcdae9122142b..b6bc04d5b2613 100644 --- a/README.md +++ b/README.md @@ -277,6 +277,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_ko.md b/README_ko.md index ee2d52cf5ac80..b4eddfbbb6563 100644 --- a/README_ko.md +++ b/README_ko.md @@ -228,6 +228,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. diff --git a/README_zh-hans.md b/README_zh-hans.md index e0897cfbb012b..96f71b0897bf9 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -252,7 +252,8 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang发布。 +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. 
**[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index b0536426a45e5..69fa1dc0b9b2b 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -264,7 +264,8 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. 
**[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index dfbdd503e6005..a4cd1005e3e83 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -57,7 +57,7 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides isExpanded: false title: Natural Language Processing - sections: @@ -503,4 +503,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API + title: API \ No newline at end of file diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 4d3aa7711503d..579c6172d6a6b 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,6 +69,7 @@ The documentation is organized into five sections: 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx index 53129b77ea05a..d5846cbfee327 100644 --- a/docs/source/en/model_doc/conditional_detr.mdx +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -24,30 +24,30 @@ The abstract from the paper is the following: This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The original code can be found [here](https://github.com/Atten4Vis/ConditionalDETR). -## ConditionalDETRConfig +## ConditionalDetrConfig -[[autodoc]] ConditionalDETRConfig +[[autodoc]] ConditionalDetrConfig -## ConditionalDETRFeatureExtractor +## ConditionalDetrFeatureExtractor -[[autodoc]] ConditionalDETRFeatureExtractor +[[autodoc]] ConditionalDetrFeatureExtractor - __call__ - pad_and_create_pixel_mask - post_process - post_process_segmentation - post_process_panoptic -## ConditionalDETRModel +## ConditionalDetrModel -[[autodoc]] ConditionalDETRModel +[[autodoc]] ConditionalDetrModel - forward -## ConditionalDETRForObjectDetection +## ConditionalDetrForObjectDetection -[[autodoc]] ConditionalDETRForObjectDetection +[[autodoc]] ConditionalDetrForObjectDetection - forward -## ConditionalDETRForSegmentation +## ConditionalDetrForSegmentation -[[autodoc]] ConditionalDETRForSegmentation +[[autodoc]] ConditionalDetrForSegmentation - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9da049462d860..af17f29e0ded3 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -172,7 +172,7 @@ "CLIPVisionConfig", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], - "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDETRConfig"], + "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDetrConfig"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.cpm": [], @@ -661,7 +661,7 @@ _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") - _import_structure["models.conditional_detr"].append("ConditionalDETRFeatureExtractor") + _import_structure["models.conditional_detr"].append("ConditionalDetrFeatureExtractor") _import_structure["models.donut"].append("DonutFeatureExtractor") _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaProcessor"]) @@ -713,10 +713,10 @@ _import_structure["models.conditional_detr"].extend( [ "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", - "ConditionalDETRForObjectDetection", - "ConditionalDETRForSegmentation", - "ConditionalDETRModel", - "ConditionalDETRPreTrainedModel", + "ConditionalDetrForObjectDetection", + "ConditionalDetrForSegmentation", + 
"ConditionalDetrModel", + "ConditionalDetrPreTrainedModel", ] ) @@ -3086,7 +3086,7 @@ CLIPVisionConfig, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer - from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDETRConfig + from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDetrConfig from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer @@ -3510,7 +3510,7 @@ from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor - from .models.conditional_detr import ConditionalDETRFeatureExtractor + from .models.conditional_detr import ConditionalDetrFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor @@ -3548,10 +3548,10 @@ ) from .models.conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, - ConditionalDETRModel, - ConditionalDETRPreTrainedModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ConditionalDetrPreTrainedModel, ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index b28bb0665bc3d..258fb5d645bf1 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -43,7 +43,7 @@ ("canine", "CanineConfig"), ("clip", "CLIPConfig"), ("codegen", "CodeGenConfig"), - ("conditional_detr", "ConditionalDETRConfig"), + ("conditional_detr", "ConditionalDetrConfig"), ("convbert", "ConvBertConfig"), ("convnext", "ConvNextConfig"), ("ctrl", "CTRLConfig"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 025db1fa81cbe..072805d6d74b2 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -42,7 +42,7 @@ ("canine", "CanineModel"), ("clip", "CLIPModel"), ("codegen", "CodeGenModel"), - ("conditional_detr", "ConditionalDETRModel"), + ("conditional_detr", "ConditionalDetrModel"), ("convbert", "ConvBertModel"), ("convnext", "ConvNextModel"), ("ctrl", "CTRLModel"), @@ -374,7 +374,7 @@ [ # Do not add new models here, this class will be deprecated in the future. 
# Model for Image Segmentation mapping - ("conditional_detr", "ConditionalDETRForSegmentation"), + ("conditional_detr", "ConditionalDetrForSegmentation"), ("detr", "DetrForSegmentation"), ] ) diff --git a/src/transformers/models/conditional_detr/__init__.py b/src/transformers/models/conditional_detr/__init__.py index 1b72e7d7a3238..c2f1bdfdbbaae 100644 --- a/src/transformers/models/conditional_detr/__init__.py +++ b/src/transformers/models/conditional_detr/__init__.py @@ -24,8 +24,8 @@ _import_structure = { "configuration_conditional_detr": [ "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", - "ConditionalDETRConfig", - "ConditionalDETROnnxConfig", + "ConditionalDetrConfig", + "ConditionalDetrOnnxConfig", ] } @@ -35,7 +35,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDETRFeatureExtractor"] + _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDetrFeatureExtractor"] try: if not is_timm_available(): @@ -45,18 +45,18 @@ else: _import_structure["modeling_conditional_detr"] = [ "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", - "ConditionalDETRForObjectDetection", - "ConditionalDETRForSegmentation", - "ConditionalDETRModel", - "ConditionalDETRPreTrainedModel", + "ConditionalDetrForObjectDetection", + "ConditionalDetrForSegmentation", + "ConditionalDetrModel", + "ConditionalDetrPreTrainedModel", ] if TYPE_CHECKING: from .configuration_conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, - ConditionalDETRConfig, - ConditionalDETROnnxConfig, + ConditionalDetrConfig, + ConditionalDetrOnnxConfig, ) try: @@ -65,7 +65,7 @@ except OptionalDependencyNotAvailable: pass else: - from .feature_extraction_conditional_detr import ConditionalDETRFeatureExtractor + from .feature_extraction_conditional_detr import ConditionalDetrFeatureExtractor try: if not is_timm_available(): @@ -75,10 +75,10 @@ else: from .modeling_conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, - ConditionalDETRModel, - ConditionalDETRPreTrainedModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ConditionalDetrPreTrainedModel, ) else: diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 4e0b4674356d9..6abcf85894735 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -33,9 +33,9 @@ } -class ConditionalDETRConfig(PretrainedConfig): +class ConditionalDetrConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate + This is the configuration class to store the configuration of a [`ConditionalDetrModel`]. It is used to instantiate a Conditional DETR model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Conditional DETR [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) architecture. @@ -48,7 +48,7 @@ class ConditionalDETRConfig(PretrainedConfig): The number of input channels. num_queries (`int`, *optional*, defaults to 100): Number of object queries, i.e. detection slots. 
This is the maximal number of objects - [`ConditionalDETRModel`] can detect in a single image. For COCO, we recommend 100 queries. + [`ConditionalDetrModel`] can detect in a single image. For COCO, we recommend 100 queries. d_model (`int`, *optional*, defaults to 256): Dimension of the layers. encoder_layers (`int`, *optional*, defaults to 6): @@ -114,13 +114,13 @@ class ConditionalDETRConfig(PretrainedConfig): Examples: ```python - >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig + >>> from transformers import ConditionalDetrModel, ConditionalDetrConfig >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration - >>> configuration = ConditionalDETRConfig() + >>> configuration = ConditionalDetrConfig() >>> # Initializing a model from the microsoft/conditional-detr-resnet-50 style configuration - >>> model = ConditionalDETRModel(configuration) + >>> model = ConditionalDetrModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -214,7 +214,7 @@ def hidden_size(self) -> int: return self.d_model -class ConditionalDETROnnxConfig(OnnxConfig): +class ConditionalDetrOnnxConfig(OnnxConfig): torch_onnx_minimum_version = version.parse("1.11") diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index bcb2bbdda9a7d..904530c44c227 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -26,10 +26,10 @@ import requests from huggingface_hub import hf_hub_download from transformers import ( - ConditionalDETRConfig, - ConditionalDETRFeatureExtractor, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, + ConditionalDetrConfig, + ConditionalDetrFeatureExtractor, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, ) from transformers.utils import logging @@ -226,7 +226,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): """ # load default config - config = ConditionalDETRConfig() + config = ConditionalDetrConfig() # set backbone and dilation attributes if "resnet101" in model_name: config.backbone = "resnet101" @@ -246,7 +246,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): # load feature extractor format = "coco_panoptic" if is_panoptic else "coco_detection" - feature_extractor = ConditionalDETRFeatureExtractor(format=format) + feature_extractor = ConditionalDetrFeatureExtractor(format=format) # prepare image img = prepare_img() @@ -290,7 +290,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): val = state_dict.pop(key) state_dict[prefix + key] = val # finally, create HuggingFace model and load state dict - model = ConditionalDETRForSegmentation(config) if is_panoptic else ConditionalDETRForObjectDetection(config) + model = ConditionalDetrForSegmentation(config) if is_panoptic else ConditionalDetrForObjectDetection(config) model.load_state_dict(state_dict) model.eval() model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py 
b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f37e3e1fb69bf..ec324f1f72456 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -119,7 +119,7 @@ def id_to_rgb(id_map): return color -class ConditionalDETRFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): +class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" Constructs a CONDITIONAL_DETR feature extractor. @@ -431,11 +431,11 @@ def __call__( annotations (`Dict`, `List[Dict]`, *optional*): The corresponding annotations in COCO format. - In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_detection"`, the + In case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_detection"`, the annotations for each image should have the following format: {'image_id': int, 'annotations': [annotation]}, with the annotations being a list of COCO object annotations. - In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the + In case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int, 'file_name': str, 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic annotations. @@ -445,7 +445,7 @@ def __call__( masks_path (`pathlib.Path`, *optional*): Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only - relevant in case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`. + relevant in case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`. pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`): Whether or not to pad images up to the largest image in a batch and create a pixel mask. @@ -676,11 +676,11 @@ def pad_and_create_pixel_mask( # inspired by https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py#L258 def post_process(self, outputs, target_sizes): """ - Converts the output of [`ConditionalDETRForObjectDetection`] into the format expected by the COCO api. Only + Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the COCO api. Only supports PyTorch. Args: - outputs ([`ConditionalDETRObjectDetectionOutput`]): + outputs ([`ConditionalDetrObjectDetectionOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original @@ -717,11 +717,11 @@ def post_process(self, outputs, target_sizes): def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): """ - Converts the output of [`ConditionalDETRForSegmentation`] into image segmentation predictions. Only supports + Converts the output of [`ConditionalDetrForSegmentation`] into image segmentation predictions. Only supports PyTorch. Parameters: - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. 
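Since the `post_process` docstring above only describes the expected inputs and outputs, here is a sketch of the end-to-end flow it is designed for. The checkpoint name is the one referenced in the configuration docstring earlier in this series; the example image URL and the 0.7 score threshold are illustrative assumptions:

```python
import torch
import requests
from PIL import Image

from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForObjectDetection

# any RGB image works; this COCO validation image is just an example
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50")
model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50")

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post_process expects the original (height, width) of every image in the batch
target_sizes = torch.tensor([image.size[::-1]])
results = feature_extractor.post_process(outputs, target_sizes)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    if score > 0.7:
        print(model.config.id2label[label.item()], [round(c, 2) for c in box.tolist()])
```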
@@ -760,14 +760,14 @@ def to_tuple(tup): # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): """ - Converts the output of [`ConditionalDETRForSegmentation`] into actual instance segmentation predictions. Only + Converts the output of [`ConditionalDetrForSegmentation`] into actual instance segmentation predictions. Only supports PyTorch. Args: results (`List[Dict]`): - Results list obtained by [`~ConditionalDETRFeatureExtractor.post_process`], to which "masks" results + Results list obtained by [`~ConditionalDetrFeatureExtractor.post_process`], to which "masks" results will be added. - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original @@ -804,11 +804,11 @@ def post_process_instance(self, results, outputs, orig_target_sizes, max_target_ # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): """ - Converts the output of [`ConditionalDETRForSegmentation`] into actual panoptic predictions. Only supports + Converts the output of [`ConditionalDetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch. Parameters: - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 54dea544f3da9..05343a89f3ed0 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -39,7 +39,7 @@ replace_return_docstrings, requires_backends, ) -from .configuration_conditional_detr import ConditionalDETRConfig +from .configuration_conditional_detr import ConditionalDetrConfig if is_scipy_available(): @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = "ConditionalDETRConfig" +_CONFIG_FOR_DOC = "ConditionalDetrConfig" _CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -63,7 +63,7 @@ @dataclass -class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): +class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions): """ Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output @@ -95,7 +95,7 @@ class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): @dataclass -class ConditionalDETRModelOutput(Seq2SeqModelOutput): +class ConditionalDetrModelOutput(Seq2SeqModelOutput): """ Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. 
the output of each decoder @@ -137,10 +137,10 @@ class ConditionalDETRModelOutput(Seq2SeqModelOutput): @dataclass -# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDETR -class ConditionalDETRObjectDetectionOutput(ModelOutput): +# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDetr +class ConditionalDetrObjectDetectionOutput(ModelOutput): """ - Output type of [`ConditionalDETRForObjectDetection`]. + Output type of [`ConditionalDetrForObjectDetection`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): @@ -154,7 +154,7 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + possible padding). You can use [`~ConditionalDetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) @@ -201,10 +201,10 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): @dataclass -# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDETR -class ConditionalDETRSegmentationOutput(ModelOutput): +# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDetr +class ConditionalDetrSegmentationOutput(ModelOutput): """ - Output type of [`ConditionalDETRForSegmentation`]. + Output type of [`ConditionalDetrForSegmentation`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): @@ -218,12 +218,12 @@ class ConditionalDETRSegmentationOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + possible padding). You can use [`~ConditionalDetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding boxes. pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): Segmentation masks logits for all queries. See also - [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or - [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation + [`~ConditionalDetrFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDetrFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks respectively. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxiliary losses are activated (i.e. 
`config.auxiliary_loss` is set to `True`) @@ -272,8 +272,8 @@ class ConditionalDETRSegmentationOutput(ModelOutput): # BELOW: utilities copied from # https://github.com/facebookresearch/detr/blob/master/backbone.py -# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDETR -class ConditionalDETRFrozenBatchNorm2d(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDetr +class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ BatchNorm2d where the batch statistics and the affine parameters are fixed. @@ -282,7 +282,7 @@ class ConditionalDETRFrozenBatchNorm2d(nn.Module): """ def __init__(self, n): - super(ConditionalDETRFrozenBatchNorm2d, self).__init__() + super(ConditionalDetrFrozenBatchNorm2d, self).__init__() self.register_buffer("weight", torch.ones(n)) self.register_buffer("bias", torch.zeros(n)) self.register_buffer("running_mean", torch.zeros(n)) @@ -295,7 +295,7 @@ def _load_from_state_dict( if num_batches_tracked_key in state_dict: del state_dict[num_batches_tracked_key] - super(ConditionalDETRFrozenBatchNorm2d, self)._load_from_state_dict( + super(ConditionalDetrFrozenBatchNorm2d, self)._load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) @@ -312,12 +312,12 @@ def forward(self, x): return x * scale + bias -# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDetr def replace_batch_norm(m, name=""): for attr_str in dir(m): target_attr = getattr(m, attr_str) if isinstance(target_attr, nn.BatchNorm2d): - frozen = ConditionalDETRFrozenBatchNorm2d(target_attr.num_features) + frozen = ConditionalDetrFrozenBatchNorm2d(target_attr.num_features) bn = getattr(m, attr_str) frozen.weight.data.copy_(bn.weight) frozen.bias.data.copy_(bn.bias) @@ -328,11 +328,11 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) -class ConditionalDETRTimmConvEncoder(nn.Module): +class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. - nn.BatchNorm2d layers are replaced by ConditionalDETRFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. """ @@ -369,8 +369,8 @@ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): return out -# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDETR -class ConditionalDETRConvModel(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDetr +class ConditionalDetrConvModel(nn.Module): """ This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. """ @@ -391,7 +391,7 @@ def forward(self, pixel_values, pixel_mask): return out, pos -# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDetr def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
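The `_expand_mask` docstring above summarizes everything the helper does; a self-contained approximation (a sketch following the broadcast-and-fill pattern visible in the surrounding hunks, not the exact library code) looks like this:

```python
from typing import Optional

import torch


def expand_mask_sketch(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None) -> torch.Tensor:
    # mask: [bsz, src_len], 1 for real positions and 0 for padding
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len
    expanded = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    inverted = 1.0 - expanded
    # padded positions receive the most negative representable value, so softmax ignores them
    return inverted.masked_fill(inverted.bool(), torch.finfo(dtype).min)


pixel_mask = torch.tensor([[1, 1, 1, 0]])  # one sample, last position is padding
print(expand_mask_sketch(pixel_mask, torch.float32).shape)  # torch.Size([1, 1, 4, 4])
```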
@@ -406,8 +406,8 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDETR -class ConditionalDETRSinePositionEmbedding(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDetr +class ConditionalDetrSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. @@ -444,8 +444,8 @@ def forward(self, pixel_values, pixel_mask): return pos -# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDETR -class ConditionalDETRLearnedPositionEmbedding(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDetr +class ConditionalDetrLearnedPositionEmbedding(nn.Module): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -468,14 +468,14 @@ def forward(self, pixel_values, pixel_mask=None): return pos -# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDetr def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = ConditionalDETRSinePositionEmbedding(n_steps, normalize=True) + position_embedding = ConditionalDetrSinePositionEmbedding(n_steps, normalize=True) elif config.position_embedding_type == "learned": - position_embedding = ConditionalDETRLearnedPositionEmbedding(n_steps) + position_embedding = ConditionalDetrLearnedPositionEmbedding(n_steps) else: raise ValueError(f"Not supported {config.position_embedding_type}") @@ -634,7 +634,7 @@ def forward( return attn_output, attn_weights_reshaped -class ConditionalDETRAttention(nn.Module): +class ConditionalDetrAttention(nn.Module): """ Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. 
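For reference, the cross-attention introduced here operates on queries and keys that concatenate a content part with a spatial (sine) part, which is why the decoder layer below instantiates it with `self.embed_dim * 2`. A rough sketch of that concatenation, with hypothetical shapes:

```python
import torch

batch_size, num_queries, d_model = 2, 300, 256  # hypothetical sizes
content_query = torch.randn(batch_size, num_queries, d_model)  # content stream from decoder self-attention
spatial_query = torch.randn(batch_size, num_queries, d_model)  # sine embedding of the learned reference points
# the concatenated query fed to the conditional cross-attention has width 2 * d_model
query = torch.cat([content_query, spatial_query], dim=-1)
print(query.shape)  # torch.Size([2, 300, 512])
```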
@@ -750,8 +750,8 @@ def forward( return attn_output, attn_weights_reshaped -class ConditionalDETREncoderLayer(nn.Module): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrEncoderLayer(nn.Module): + def __init__(self, config: ConditionalDetrConfig): super().__init__() self.embed_dim = config.d_model self.self_attn = DetrAttention( @@ -819,8 +819,8 @@ def forward( return outputs -class ConditionalDETRDecoderLayer(nn.Module): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrDecoderLayer(nn.Module): + def __init__(self, config: ConditionalDetrConfig): super().__init__() self.embed_dim = config.d_model @@ -832,7 +832,7 @@ def __init__(self, config: ConditionalDETRConfig): self.sa_kpos_proj = nn.Linear(d_model, d_model) self.sa_v_proj = nn.Linear(d_model, d_model) - self.self_attn = ConditionalDETRAttention( + self.self_attn = ConditionalDetrAttention( embed_dim=self.embed_dim, out_dim=self.embed_dim, num_heads=config.decoder_attention_heads, @@ -852,7 +852,7 @@ def __init__(self, config: ConditionalDETRConfig): self.ca_v_proj = nn.Linear(d_model, d_model) self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) - self.encoder_attn = ConditionalDETRAttention( + self.encoder_attn = ConditionalDetrAttention( self.embed_dim * 2, self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -989,8 +989,8 @@ def forward( return outputs -# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDETR -class ConditionalDETRClassificationHead(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDetr +class ConditionalDetrClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): @@ -1023,9 +1023,9 @@ def forward(self, x): return x -# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDETR -class ConditionalDETRPreTrainedModel(PreTrainedModel): - config_class = ConditionalDETRConfig +# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDetr +class ConditionalDetrPreTrainedModel(PreTrainedModel): + config_class = ConditionalDetrConfig base_model_prefix = "model" main_input_name = "pixel_values" @@ -1033,12 +1033,12 @@ def _init_weights(self, module): std = self.config.init_std xavier_std = self.config.init_xavier_std - if isinstance(module, ConditionalDETRMHAttentionMap): + if isinstance(module, ConditionalDetrMHAttentionMap): nn.init.zeros_(module.k_linear.bias) nn.init.zeros_(module.q_linear.bias) nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) - elif isinstance(module, ConditionalDETRLearnedPositionEmbedding): + elif isinstance(module, ConditionalDetrLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): @@ -1053,7 +1053,7 @@ def _init_weights(self, module): module.weight.data[module.padding_idx].zero_() def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, ConditionalDETRDecoder): + if isinstance(module, ConditionalDetrDecoder): module.gradient_checkpointing = value @@ -1067,7 +1067,7 @@ def _set_gradient_checkpointing(self, module, 
value=False): and behavior. Parameters: - config ([`ConditionalDETRConfig`]): + config ([`ConditionalDetrConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. @@ -1078,8 +1078,8 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See - [`ConditionalDETRFeatureExtractor.__call__`] for details. + Pixel values can be obtained using [`ConditionalDetrFeatureExtractor`]. See + [`ConditionalDetrFeatureExtractor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: @@ -1112,11 +1112,11 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR -class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDetr +class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a - [`ConditionalDETREncoderLayer`]. + [`ConditionalDetrEncoderLayer`]. The encoder updates the flattened feature map through multiple self-attention layers. @@ -1125,16 +1125,16 @@ class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): - position_embeddings are added to the forward pass. Args: - config: ConditionalDETRConfig + config: ConditionalDetrConfig """ - def __init__(self, config: ConditionalDETRConfig): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.encoder_layerdrop - self.layers = nn.ModuleList([ConditionalDETREncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default @@ -1222,9 +1222,9 @@ def forward( ) -class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): +class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel): """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDETRDecoderLayer`]. + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDetrDecoderLayer`]. The decoder updates the query embeddings through multiple self-attention and cross-attention layers. @@ -1234,15 +1234,15 @@ class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. 
Args: - config: ConditionalDETRConfig + config: ConditionalDetrConfig """ - def __init__(self, config: ConditionalDETRConfig): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.decoder_layerdrop - self.layers = nn.ModuleList([ConditionalDETRDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output self.layernorm = nn.LayerNorm(config.d_model) d_model = config.d_model @@ -1425,7 +1425,7 @@ def custom_forward(*inputs): ] if v is not None ) - return ConditionalDETRDecoderOutput( + return ConditionalDetrDecoderOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attns, @@ -1442,22 +1442,22 @@ def custom_forward(*inputs): """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRModel(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrModel(ConditionalDetrPreTrainedModel): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # Create backbone + positional encoding - backbone = ConditionalDETRTimmConvEncoder(config.backbone, config.dilation) + backbone = ConditionalDetrTimmConvEncoder(config.backbone, config.dilation) position_embeddings = build_position_encoding(config) - self.backbone = ConditionalDETRConvModel(backbone, position_embeddings) + self.backbone = ConditionalDetrConvModel(backbone, position_embeddings) # Create projection layer self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) - self.encoder = ConditionalDETREncoder(config) - self.decoder = ConditionalDETRDecoder(config) + self.encoder = ConditionalDetrEncoder(config) + self.decoder = ConditionalDetrDecoder(config) # Initialize weights and apply final processing self.post_init() @@ -1477,7 +1477,7 @@ def unfreeze_backbone(self): param.requires_grad_(True) @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRModelOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1496,15 +1496,15 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRModel + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrModel >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state @@ -1582,7 +1582,7 @@ def forward( if not return_dict: return decoder_outputs + encoder_outputs - return ConditionalDETRModelOutput( + return 
ConditionalDetrModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, @@ -1602,18 +1602,18 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # CONDITIONAL_DETR encoder-decoder model - self.model = ConditionalDETRModel(config) + self.model = ConditionalDetrModel(config) # Object detection heads self.class_labels_classifier = nn.Linear( config.d_model, config.num_labels ) # We add one for the "no object" class - self.bbox_predictor = ConditionalDETRMLPPredictionHead( + self.bbox_predictor = ConditionalDetrMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) @@ -1629,7 +1629,7 @@ def _set_aux_loss(self, outputs_class, outputs_coord): return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1655,15 +1655,15 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForObjectDetection >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) @@ -1703,12 +1703,12 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: # First: create the matcher - matcher = ConditionalDETRHungarianMatcher( + matcher = ConditionalDetrHungarianMatcher( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion losses = ["labels", "boxes", "cardinality"] - criterion = ConditionalDETRLoss( + criterion = ConditionalDetrLoss( matcher=matcher, num_classes=self.config.num_labels, focal_alpha=self.config.focal_alpha, @@ -1751,7 +1751,7 @@ def forward( output = (logits, pred_boxes) + outputs return ((loss, loss_dict) + output) if loss is not None else output - return ConditionalDETRObjectDetectionOutput( + return ConditionalDetrObjectDetectionOutput( loss=loss, loss_dict=loss_dict, logits=logits, @@ -1775,22 +1775,22 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel): + def 
__init__(self, config: ConditionalDetrConfig): super().__init__(config) # object detection model - self.conditional_detr = ConditionalDETRForObjectDetection(config) + self.conditional_detr = ConditionalDetrForObjectDetection(config) # segmentation head hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads intermediate_channel_sizes = self.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes - self.mask_head = ConditionalDETRMaskHeadSmallConv( + self.mask_head = ConditionalDetrMaskHeadSmallConv( hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size ) - self.bbox_attention = ConditionalDETRMHAttentionMap( + self.bbox_attention = ConditionalDetrMHAttentionMap( hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std ) @@ -1798,7 +1798,7 @@ def __init__(self, config: ConditionalDETRConfig): self.post_init() @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRSegmentationOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrSegmentationOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1832,16 +1832,16 @@ def forward( >>> import torch >>> import numpy - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForSegmentation >>> from transformers.models.conditional_detr.feature_extraction_conditional_detr import rgb_to_id >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained( + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained( ... "facebook/conditional_detr-resnet-50-panoptic" ... 
) - >>> model = ConditionalDETRForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") + >>> model = ConditionalDetrForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") @@ -1849,7 +1849,7 @@ def forward( >>> # forward pass >>> outputs = model(**inputs) - >>> # use the `post_process_panoptic` method of `ConditionalDETRFeatureExtractor` to convert to COCO format + >>> # use the `post_process_panoptic` method of `ConditionalDetrFeatureExtractor` to convert to COCO format >>> processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0) >>> result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0] @@ -1947,12 +1947,12 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: # First: create the matcher - matcher = ConditionalDETRHungarianMatcher( + matcher = ConditionalDetrHungarianMatcher( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion losses = ["labels", "boxes", "cardinality", "masks"] - criterion = ConditionalDETRLoss( + criterion = ConditionalDetrLoss( matcher=matcher, num_classes=self.config.num_labels, focal_alpha=self.config.focal_alpha, @@ -1991,7 +1991,7 @@ def forward( output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs return ((loss, loss_dict) + output) if loss is not None else output - return ConditionalDETRSegmentationOutput( + return ConditionalDetrSegmentationOutput( loss=loss, loss_dict=loss_dict, logits=logits, @@ -2013,8 +2013,8 @@ def _expand(tensor, length: int): # taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py -# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDETR -class ConditionalDETRMaskHeadSmallConv(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDetr +class ConditionalDetrMaskHeadSmallConv(nn.Module): """ Simple convolutional head, using group norm. Upsampling is done using a FPN approach """ @@ -2094,8 +2094,8 @@ def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): return x -# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDETR -class ConditionalDETRMHAttentionMap(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDetr +class ConditionalDetrMHAttentionMap(nn.Module): """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): @@ -2173,16 +2173,16 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py -class ConditionalDETRLoss(nn.Module): +class ConditionalDetrLoss(nn.Module): """ - This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process + This class computes the losses for ConditionalDetrForObjectDetection/ConditionalDetrForSegmentation. The process happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box). 
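For reference, step 1) is a standard Hungarian assignment; a toy sketch using only an L1 box cost is shown below, whereas the real `ConditionalDetrHungarianMatcher` combines class, L1 and generalized IoU costs weighted by `class_cost`, `bbox_cost` and `giou_cost`:

```python
import torch
from scipy.optimize import linear_sum_assignment

pred_boxes = torch.rand(5, 4)    # 5 predicted boxes as (center_x, center_y, width, height) in [0, 1]
target_boxes = torch.rand(2, 4)  # 2 ground-truth boxes in the same format
cost_matrix = torch.cdist(pred_boxes, target_boxes, p=1)  # (5, 2) pairwise L1 cost
pred_indices, target_indices = linear_sum_assignment(cost_matrix.numpy())
print(list(zip(pred_indices, target_indices)))  # one-to-one (prediction, target) matches
```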
Args: - matcher (`ConditionalDETRHungarianMatcher`): + matcher (`ConditionalDetrHungarianMatcher`): Module able to compute a matching between targets and proposals. num_classes (`int`): Number of object categories, omitting the special no-object category. @@ -2375,7 +2375,7 @@ def forward(self, outputs, targets): # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -class ConditionalDETRMLPPredictionHead(nn.Module): +class ConditionalDetrMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, height and width of a bounding box w.r.t. an image. @@ -2397,7 +2397,7 @@ def forward(self, x): # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py -class ConditionalDETRHungarianMatcher(nn.Module): +class ConditionalDetrHungarianMatcher(nn.Module): """ This class computes an assignment between the targets and the predictions of the network. diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index f61b8dfdda282..d2ec5be33ceb8 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -24,7 +24,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class ConditionalDETRFeatureExtractor(metaclass=DummyObject): +class ConditionalDetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index b5e19fe005da7..2d21cafc68520 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -32,10 +32,10 @@ if is_vision_available(): from PIL import Image - from transformers import ConditionalDETRFeatureExtractor + from transformers import ConditionalDetrFeatureExtractor -class ConditionalDETRFeatureExtractionTester(unittest.TestCase): +class ConditionalDetrFeatureExtractionTester(unittest.TestCase): def __init__( self, parent, @@ -74,7 +74,7 @@ def prepare_feat_extract_dict(self): def get_expected_values(self, image_inputs, batched=False): """ - This function computes the expected height and width when providing images to ConditionalDETRFeatureExtractor, + This function computes the expected height and width when providing images to ConditionalDetrFeatureExtractor, assuming do_resize is set to True with a scalar size. 
""" if not batched: @@ -106,12 +106,12 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision -class ConditionalDETRFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): +class ConditionalDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): - feature_extraction_class = ConditionalDETRFeatureExtractor if is_vision_available() else None + feature_extraction_class = ConditionalDetrFeatureExtractor if is_vision_available() else None def setUp(self): - self.feature_extract_tester = ConditionalDETRFeatureExtractionTester(self) + self.feature_extract_tester = ConditionalDetrFeatureExtractionTester(self) @property def feat_extract_dict(self): @@ -253,7 +253,7 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} # encode them - feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") # verify pixel values @@ -300,7 +300,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): # encode them # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic - feature_extractor = ConditionalDETRFeatureExtractor(format="coco_panoptic") + feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic") encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") # verify pixel values diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 38933f26c6627..fd008b9af494f 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -19,7 +19,7 @@ import math import unittest -from transformers import ConditionalDETRConfig, is_timm_available, is_vision_available +from transformers import ConditionalDetrConfig, is_timm_available, is_vision_available from transformers.testing_utils import require_timm, require_vision, slow, torch_device from transformers.utils import cached_property @@ -31,16 +31,16 @@ if is_timm_available(): import torch - from transformers import ConditionalDETRForObjectDetection, ConditionalDETRForSegmentation, ConditionalDETRModel + from transformers import ConditionalDetrForObjectDetection, ConditionalDetrForSegmentation, ConditionalDetrModel if is_vision_available(): from PIL import Image - from transformers import ConditionalDETRFeatureExtractor + from transformers import ConditionalDetrFeatureExtractor -class ConditionalDETRModelTester: +class ConditionalDetrModelTester: def __init__( self, parent, @@ -105,7 +105,7 @@ def prepare_config_and_inputs(self): return config, pixel_values, pixel_mask, labels def get_config(self): - return ConditionalDETRConfig( + return ConditionalDetrConfig( d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers, @@ -125,7 +125,7 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict def create_and_check_conditional_detr_model(self, config, pixel_values, pixel_mask, labels): - model = ConditionalDETRModel(config=config) + model = ConditionalDetrModel(config=config) model.to(torch_device) model.eval() @@ -137,7 +137,7 @@ def create_and_check_conditional_detr_model(self, config, 
pixel_values, pixel_ma ) def create_and_check_conditional_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): - model = ConditionalDETRForObjectDetection(config=config) + model = ConditionalDetrForObjectDetection(config=config) model.to(torch_device) model.eval() @@ -155,12 +155,12 @@ def create_and_check_conditional_detr_object_detection_head_model(self, config, @require_timm -class ConditionalDETRModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( ( - ConditionalDETRModel, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, + ConditionalDetrModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, ) if is_timm_available() else () @@ -176,7 +176,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class.__name__ in ["ConditionalDETRForObjectDetection", "ConditionalDETRForSegmentation"]: + if model_class.__name__ in ["ConditionalDetrForObjectDetection", "ConditionalDetrForSegmentation"]: labels = [] for i in range(self.model_tester.batch_size): target = {} @@ -199,8 +199,8 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict def setUp(self): - self.model_tester = ConditionalDETRModelTester(self) - self.config_tester = ConfigTester(self, config_class=ConditionalDETRConfig, has_text_modality=False) + self.model_tester = ConditionalDetrModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConditionalDetrConfig, has_text_modality=False) def test_config(self): self.config_tester.run_common_tests() @@ -279,10 +279,10 @@ def test_attention_outputs(self): if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning # Object Detection model returns pred_logits and pred_boxes - if model_class.__name__ == "ConditionalDETRForObjectDetection": + if model_class.__name__ == "ConditionalDetrForObjectDetection": correct_outlen += 1 # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks - if model_class.__name__ == "ConditionalDETRForSegmentation": + if model_class.__name__ == "ConditionalDetrForSegmentation": correct_outlen += 2 if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned @@ -406,7 +406,7 @@ def test_different_timm_backbone(self): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - if model_class.__name__ == "ConditionalDETRForObjectDetection": + if model_class.__name__ == "ConditionalDetrForObjectDetection": expected_shape = ( self.model_tester.batch_size, self.model_tester.num_queries, @@ -452,17 +452,17 @@ def prepare_img(): @require_timm @require_vision @slow -class ConditionalDETRModelIntegrationTests(unittest.TestCase): +class ConditionalDetrModelIntegrationTests(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") if is_vision_available() else None ) def test_inference_no_head(self): - model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) 
feature_extractor = self.default_feature_extractor image = prepare_img() @@ -479,7 +479,7 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +505,7 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() diff --git a/utils/check_repo.py b/utils/check_repo.py index ffc47aa72af8f..8659f795602e2 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -60,8 +60,8 @@ "DetrEncoder", # Building part of bigger (tested) model. "DetrDecoder", # Building part of bigger (tested) model. "DetrDecoderWrapper", # Building part of bigger (tested) model. - "ConditionalDETREncoder", # Building part of bigger (tested) model. - "ConditionalDETRDecoder", # Building part of bigger (tested) model. + "ConditionalDetrEncoder", # Building part of bigger (tested) model. + "ConditionalDetrDecoder", # Building part of bigger (tested) model. "M2M100Encoder", # Building part of bigger (tested) model. "M2M100Decoder", # Building part of bigger (tested) model. "MCTCTEncoder", # Building part of bigger (tested) model. From 8064c37505b4f3eb04b186b5982f74fa0946ef4f Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:23:00 -0400 Subject: [PATCH 062/233] fixed docs --- README.md | 1 - README_zh-hans.md | 1 - README_zh-hant.md | 1 - docs/source/en/index.mdx | 3 +-- docs/source/en/serialization.mdx | 2 +- 5 files changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b6bc04d5b2613..fcdae9122142b 100644 --- a/README.md +++ b/README.md @@ -277,7 +277,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. 
**[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_zh-hans.md b/README_zh-hans.md index 96f71b0897bf9..4b6f8f00bdadb 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -253,7 +253,6 @@ conda install -c huggingface transformers 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 69fa1dc0b9b2b..8b7e17557edec 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -265,7 +265,6 @@ conda install -c huggingface transformers 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 579c6172d6a6b..40685a5c2fa8d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,7 +69,6 @@ The documentation is organized into five sections: 1. 
**[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. @@ -217,7 +216,7 @@ Flax), PyTorch, and/or TensorFlow. | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | -| conditional_detr | ❌ | ❌ | ✅ | ❌ | ❌ | +| Conditional DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index f6304004b62ae..a1577447f7235 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -57,7 +57,7 @@ Ready-made configurations include the following architectures: - CamemBERT - CLIP - CodeGen -- conditional_detr +- Conditional DETR - ConvBERT - ConvNeXT - Data2VecText From 94f63a0d14b5e13fc7eef9a14d07a0ac6d5da77c Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 21:50:18 -0400 Subject: [PATCH 063/233] Update README_ko.md --- README_ko.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README_ko.md b/README_ko.md index b4eddfbbb6563..12e65154cb4a1 100644 --- a/README_ko.md +++ b/README_ko.md @@ -229,7 +229,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From 97a9792b0d34e8961a90ce7615dffe0b1bc437d1 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:20:11 -0400 Subject: [PATCH 064/233] added spatial_model_name --- utils/check_copies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check_copies.py b/utils/check_copies.py index 7d57173654468..0c8afa7b77ae8 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,6 +477,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", + "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. 
This only concerns modular models or those who do From f2d758edc74a24fbbe528407f65f39569b760983 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:24:51 -0400 Subject: [PATCH 065/233] fixed fix-copies --- src/transformers/models/auto/configuration_auto.py | 2 +- utils/check_copies.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 258fb5d645bf1..dce73cb390365 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -302,7 +302,7 @@ ("canine", "CANINE"), ("clip", "CLIP"), ("codegen", "CodeGen"), - ("conditional_detr", "conditional_detr"), + ("conditional_detr", "Conditional DETR"), ("convbert", "ConvBERT"), ("convnext", "ConvNeXT"), ("cpm", "CPM"), diff --git a/utils/check_copies.py b/utils/check_copies.py index 0c8afa7b77ae8..7d57173654468 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,7 +477,6 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", - "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. This only concerns modular models or those who do From d00e8c5ad3486cc69d8bd91b05b6825f3afe7531 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:44:17 -0400 Subject: [PATCH 066/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index ec324f1f72456..f072310f7f878 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Feature extractor class for CONDITIONAL_DETR.""" +"""Feature extractor class for Conditional DETR.""" import io import pathlib From f29393b4e5403d83329edacf0ae1673ce2bdd604 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:44:58 -0400 Subject: [PATCH 067/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f072310f7f878..bb000a7996b3c 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -37,7 +37,7 @@ ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] -# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +# Copied from transformers.models.detr.feature_extraction_detr.center_to_corners_format def center_to_corners_format(x): """ Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format From b22a763adda642b2a161afda553c8fb83fb6a694 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:45:19 -0400 Subject: [PATCH 068/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index bb000a7996b3c..d89f3154e0bf0 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -48,6 +48,7 @@ def center_to_corners_format(x): return torch.stack(b, dim=-1) +# Copied from transformers.models.detr.feature_extraction_detr.corners_to_center_format def corners_to_center_format(x): """ Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1, From 14fa3137b369d47503bc385f8c581fbbb150bc4c Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:45:43 -0400 Subject: [PATCH 069/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index d89f3154e0bf0..d813ab2a45e3c 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -60,6 +60,7 @@ def corners_to_center_format(x): return np.stack(b, axis=-1) +# Copied from transformers.models.detr.feature_extraction_detr.masks_to_boxes def masks_to_boxes(masks): """ Compute the 
bounding boxes around the provided panoptic segmentation masks. From 4032246617f239be6c4ac518291f977a8d77e415 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:46:05 -0400 Subject: [PATCH 070/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 05343a89f3ed0..3bf47082f6b30 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Attn4Vis and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 Microsoft Research Asia and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 318ac68ec9106d386018e6e7017ae182aa36a3e8 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:46:25 -0400 Subject: [PATCH 071/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 3bf47082f6b30..7d45259b60adc 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -57,7 +57,7 @@ _CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Atten4Vis/ConditionalDETR", + "microsoft/conditional-detr-resnet-50", # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr ] From 1a9c9204a5374db062f7bf2b30a5d99acd750567 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:51:20 -0400 Subject: [PATCH 072/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 7d45259b60adc..bf4d82479375e 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -270,8 +270,6 @@ class ConditionalDetrSegmentationOutput(ModelOutput): encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None -# BELOW: utilities copied from -# https://github.com/facebookresearch/detr/blob/master/backbone.py # Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDetr class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ From 389b28c41ba6c0fa5ab65f2a1c52ed94cae6bae1 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:52:46 -0400 Subject: [PATCH 073/233] Update 
src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index bf4d82479375e..8550bba968802 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -326,6 +326,7 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) +# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. From 41395f299bb9791cb116a2f4bc0b6cde716f424a Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:54:05 -0400 Subject: [PATCH 074/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 8550bba968802..415f0d47a41f5 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -503,6 +503,7 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) +# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. From c3e3ff174dcd5cba0301def72d095cb6251e9a12 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:56:48 -0400 Subject: [PATCH 075/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 415f0d47a41f5..be07eca130e23 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1112,7 +1112,7 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDetr +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL DETR,Detr->ConditionalDetr class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. 
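# Illustrative sketch of the idea stated in the DetrAttention docstring referenced above:
# position embeddings are added to the queries and keys, but not to the values, before a
# standard multi-head attention call. All shapes and sizes here are arbitrary assumptions.
import torch
from torch import nn

attention = nn.MultiheadAttention(embed_dim=256, num_heads=8, batch_first=True)
hidden_states = torch.randn(2, 100, 256)        # (batch, sequence, d_model)
position_embeddings = torch.randn(2, 100, 256)

queries = keys = hidden_states + position_embeddings   # content + position
values = hidden_states                                  # content only
attn_output, _ = attention(queries, keys, values)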
Each layer is a From ce3498014859911d6e961cc2a50143a50c339726 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:04 -0400 Subject: [PATCH 076/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index be07eca130e23..00faac56fa1de 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1228,7 +1228,7 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel): The decoder updates the query embeddings through multiple self-attention and cross-attention layers. - Some small tweaks for CONDITIONAL_DETR: + Some small tweaks for Conditional DETR: - position_embeddings and query_position_embeddings are added to the forward pass. - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. From 5e5b6d43ef668a2c27d5441675b4b4c774be35c2 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:19 -0400 Subject: [PATCH 077/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 00faac56fa1de..849502de9323a 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1243,7 +1243,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layerdrop = config.decoder_layerdrop self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) - # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output + # in Conditional DETR, the decoder uses layernorm after the last decoder layer output self.layernorm = nn.LayerNorm(config.d_model) d_model = config.d_model self.gradient_checkpointing = False From 5e0259448d7eda7d2267486c2806bf8c288883f1 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:42 -0400 Subject: [PATCH 078/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 849502de9323a..a063c1c2e46e4 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1437,7 +1437,7 @@ def custom_forward(*inputs): @add_start_docstrings( """ - The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + The bare Conditional DETR Model (consisting of a backbone and encoder-decoder Transformer) 
outputting raw hidden-states without any specific head on top. """, CONDITIONAL_DETR_START_DOCSTRING, From 0f28629bff5ba7c15705f09f0e57e818201d9dca Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:26:13 -0400 Subject: [PATCH 079/233] added some copied from --- .../feature_extraction_conditional_detr.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index d813ab2a45e3c..1421e82aa789a 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -94,9 +94,7 @@ def masks_to_boxes(masks): return np.stack([x_min, y_min, x_max, y_max], 1) -# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py -# Copyright (c) 2018, Alexander Kirillov -# All rights reserved. +# Copied from transformers.models.detr.feature_extraction_detr.rgb_to_id def rgb_to_id(color): if isinstance(color, np.ndarray) and len(color.shape) == 3: if color.dtype == np.uint8: @@ -104,7 +102,7 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) - +# Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): id_map_copy = id_map.copy() @@ -123,7 +121,7 @@ def id_to_rgb(id_map): class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" - Constructs a CONDITIONAL_DETR feature extractor. + Constructs a CONDITIONAL DETR feature extractor. This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. @@ -188,7 +186,7 @@ def prepare(self, image, target, return_segmentation_masks=False, masks_path=Non else: raise ValueError(f"Format {self.format} not supported") - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, segmentations, height, width): try: @@ -212,10 +210,10 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->Conditional DETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by CONDITIONAL_DETR. + Convert the target in COCO format into the format expected by CONDITIONAL DETR. 
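# The panoptic helpers above pack an RGB color into a single integer id (and back) with
# base-256 arithmetic. A quick round trip with plain Python ints, mirroring the formulas
# in the diff; purely illustrative.
def rgb_to_id_scalar(color):
    red, green, blue = color
    return red + 256 * green + 256 * 256 * blue

def id_to_rgb_scalar(segment_id):
    rgb = []
    for _ in range(3):
        rgb.append(segment_id % 256)
        segment_id //= 256
    return rgb

assert id_to_rgb_scalar(rgb_to_id_scalar((12, 34, 56))) == [12, 34, 56]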
""" w, h = image.size @@ -277,6 +275,7 @@ def prepare_coco_detection(self, image, target, return_segmentation_masks=False) return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_panoptic def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): w, h = image.size ann_info = target.copy() @@ -310,6 +309,7 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,6 +380,7 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. @@ -401,7 +402,8 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -549,7 +551,7 @@ def __call__( if annotations is not None: annotations = [annotations] - # prepare (COCO annotations as a list of Dict -> CONDITIONAL_DETR target as a single Dict per image) + # prepare (COCO annotations as a list of Dict -> CONDITIONAL DETR target as a single Dict per image) if annotations is not None: for idx, (image, target) in enumerate(zip(images, annotations)): if not isinstance(image, Image.Image): @@ -624,6 +626,7 @@ def __call__( return encoded_inputs + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] @@ -632,6 +635,7 @@ def _max_by_axis(self, the_list): maxes[index] = max(maxes[index], item) return maxes + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.pad_and_create_pixel_mask def pad_and_create_pixel_mask( self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None ): @@ -717,6 +721,7 @@ def post_process(self, outputs, target_sizes): return results + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_segmentation with Detr->ConditionalDetr def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): """ Converts the output of [`ConditionalDetrForSegmentation`] into image segmentation predictions. Only supports @@ -759,7 +764,7 @@ def to_tuple(tup): preds.append(predictions) return preds - # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_instance with Detr->ConditionalDetr def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): """ Converts the output of [`ConditionalDetrForSegmentation`] into actual instance segmentation predictions. 
Only @@ -803,7 +808,7 @@ def post_process_instance(self, results, outputs, orig_target_sizes, max_target_ return results - # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_panoptic with Detr->ConditionalDetr def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): """ Converts the output of [`ConditionalDetrForSegmentation`] into actual panoptic predictions. Only supports From 727ed2fe49510eb210890ebfdb0099c794e97dda Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:27:39 -0400 Subject: [PATCH 080/233] added some copied from --- .../conditional_detr/feature_extraction_conditional_detr.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 1421e82aa789a..812a0fe103cea 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -309,7 +309,6 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,7 +379,6 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. 
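# Rough illustration of how the post_process_* helpers above turn per-query mask logits
# into binary instance masks: keep confident queries, apply a sigmoid, then threshold.
# The threshold values follow the defaults in the diff; the tensors are random stand-ins.
import torch

mask_logits = torch.randn(300, 1, 200, 300)       # (queries, 1, height, width)
scores = torch.rand(300)                          # per-query confidence
keep = scores > 0.9                               # detection threshold
binary_masks = mask_logits[keep].sigmoid() > 0.5  # mask_threshold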
@@ -403,7 +401,6 @@ def _normalize(self, image, mean, std, target=None): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -626,7 +623,6 @@ def __call__( return encoded_inputs - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] From 7aa2fbb061d9477127562184fb1d7c83a5bd95b1 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:40:45 -0400 Subject: [PATCH 081/233] added some copied from --- .../feature_extraction_conditional_detr.py | 3 +- .../modeling_conditional_detr.py | 52 +++++++++++-------- ...est_feature_extraction_conditional_detr.py | 4 +- .../test_modeling_conditional_detr.py | 8 +-- 4 files changed, 36 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 812a0fe103cea..c9f5d8124e539 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -210,10 +210,9 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->Conditional DETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by CONDITIONAL DETR. + Convert the target in COCO format into the format expected by Conditional DETR. """ w, h = image.size diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a063c1c2e46e4..435a80b220b3b 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -54,11 +54,11 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "ConditionalDetrConfig" -_CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" +_CHECKPOINT_FOR_DOC = "microsoft/conditional-detr-resnet-50" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ "microsoft/conditional-detr-resnet-50", - # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr + # See all Conditional DETR models at https://huggingface.co/models?filter=conditional_detr ] @@ -331,11 +331,11 @@ class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. - nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. 
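# Illustrative use of the timm "features_only" backbone that ConditionalDetrTimmConvEncoder
# wraps; the exact create_model call (with in_chans and the use_pretrained_backbone flag)
# appears in the diff just below. pretrained=False here simply avoids a weight download.
import timm
import torch

backbone = timm.create_model(
    "resnet50", pretrained=False, features_only=True, out_indices=(1, 2, 3, 4)
)
feature_maps = backbone(torch.randn(1, 3, 224, 224))    # list of 4 feature maps
print([tuple(f.shape) for f in feature_maps], backbone.feature_info.channels())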
""" - def __init__(self, name: str, dilation: bool): + def __init__(self, name: str, dilation: bool, use_pretrained_backbone: bool, num_channels: int = 3): super().__init__() kwargs = {} @@ -344,7 +344,14 @@ def __init__(self, name: str, dilation: bool): requires_backends(self, ["timm"]) - backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs) + backbone = create_model( + name, + pretrained=use_pretrained_backbone, + features_only=True, + out_indices=(1, 2, 3, 4), + in_chans=num_channels, + **kwargs, + ) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) @@ -482,7 +489,7 @@ def build_position_encoding(config): # function to generate sine positional embedding for 2d coordinates -def gen_sineembed_for_position(pos_tensor): +def gen_sine_position_embeddings(pos_tensor): scale = 2 * math.pi dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) dim_t = 10000 ** (2 * (dim_t // 2) / 128) @@ -503,12 +510,11 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) -# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the CONDITIONAL_DETR paper). + Here, we add position embeddings to the queries and keys (as explained in the Conditional DETR paper). """ def __init__( @@ -1112,7 +1118,6 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL DETR,Detr->ConditionalDetr class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -1120,7 +1125,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): The encoder updates the flattened feature map through multiple self-attention layers. - Small tweak for CONDITIONAL_DETR: + Small tweak for Conditional DETR: - position_embeddings are added to the forward pass. 
@@ -1136,7 +1141,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) - # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + # in the original Conditional DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default # Initialize weights and apply final processing self.post_init() @@ -1341,7 +1346,7 @@ def forward( reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) obj_center = reference_points[..., :2].transpose(0, 1) # get sine embedding for the query vector - query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + query_sine_embed_before_transformation = gen_sine_position_embeddings(obj_center) for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1503,8 +1508,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") + >>> model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state @@ -1606,7 +1611,7 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): def __init__(self, config: ConditionalDetrConfig): super().__init__(config) - # CONDITIONAL_DETR encoder-decoder model + # CONDITIONAL DETR encoder-decoder model self.model = ConditionalDetrModel(config) # Object detection heads @@ -1662,8 +1667,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") + >>> model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) @@ -2374,7 +2379,7 @@ def forward(self, outputs, targets): return losses -# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDetr class ConditionalDetrMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, @@ -2396,7 +2401,6 @@ def forward(self, x): return x -# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py class ConditionalDetrHungarianMatcher(nn.Module): """ This class computes an assignment between the targets and the predictions of the network. 
@@ -2421,7 +2425,7 @@ def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float self.class_cost = class_cost self.bbox_cost = bbox_cost self.giou_cost = giou_cost - if class_cost == 0 or bbox_cost == 0 or giou_cost == 0: + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: raise ValueError("All costs of the Matcher can't be 0") @torch.no_grad() @@ -2488,6 +2492,7 @@ def _upcast(t: Tensor) -> Tensor: return t if t.dtype in (torch.int32, torch.int64) else t.int() +# Copied from transformers.models.detr.modeling_detr.box_area def box_area(boxes: Tensor) -> Tensor: """ Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. @@ -2504,7 +2509,7 @@ def box_area(boxes: Tensor) -> Tensor: return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) -# modified from torchvision to also return the union +# Copied from transformers.models.detr.modeling_detr.box_iou def box_iou(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) @@ -2521,6 +2526,7 @@ def box_iou(boxes1, boxes2): return iou, union +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou def generalized_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. @@ -2554,7 +2560,7 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes - +# Copied from transformers.models.detr.modeling_detr.NestedTensor class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors @@ -2575,7 +2581,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) - +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: max_size = _max_by_axis([list(img.shape) for img in tensor_list]) diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index 2d21cafc68520..371d97de0000b 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -253,7 +253,7 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} # encode them - feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") # verify pixel values @@ -299,7 +299,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") # encode them - # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic + # TODO replace by .from_pretrained microsoft/conditional-detr-resnet-50-panoptic feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic") encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index fd008b9af494f..2fb4bb562d5bf 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ 
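# Standalone sketch of the (generalized) IoU computed by the box utilities above, for a
# single pair of boxes in (x0, y0, x1, y1) corner format. It mirrors the math in the diff
# but is unbatched for readability.
import torch

def giou_pair(box1, box2):
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    inter_top_left = torch.max(box1[:2], box2[:2])
    inter_bottom_right = torch.min(box1[2:], box2[2:])
    intersection = (inter_bottom_right - inter_top_left).clamp(min=0).prod()
    union = area1 + area2 - intersection
    iou = intersection / union
    hull_top_left = torch.min(box1[:2], box2[:2])
    hull_bottom_right = torch.max(box1[2:], box2[2:])
    hull_area = (hull_bottom_right - hull_top_left).clamp(min=0).prod()
    return iou - (hull_area - union) / hull_area

print(giou_pair(torch.tensor([0.0, 0.0, 2.0, 2.0]), torch.tensor([1.0, 1.0, 3.0, 3.0])))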
b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -456,13 +456,13 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") if is_vision_available() else None ) def test_inference_no_head(self): - model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -479,7 +479,7 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +505,7 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDetrForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() From fc6c71e8d54e28dcfc524d405f225a8cea1b643f Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:43:10 -0400 Subject: [PATCH 082/233] added some copied from --- .../feature_extraction_conditional_detr.py | 3 ++- .../models/conditional_detr/modeling_conditional_detr.py | 2 ++ .../conditional_detr/test_modeling_conditional_detr.py | 8 ++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index c9f5d8124e539..922cfaf09e98e 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -102,6 +102,7 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + # Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): @@ -399,7 +400,7 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + def __call__( self, images: ImageInput, diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 435a80b220b3b..a24a306b65e9f 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2560,6 +2560,7 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes + # Copied from transformers.models.detr.modeling_detr.NestedTensor class 
NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): @@ -2581,6 +2582,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) + # Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 2fb4bb562d5bf..a67f0bc30638e 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -479,7 +479,9 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to( + torch_device + ) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +507,9 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to( + torch_device + ) feature_extractor = self.default_feature_extractor image = prepare_img() From a0d02aa3cbd4ac6f911377bc7d775ffec23450c9 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 14:52:49 -0400 Subject: [PATCH 083/233] fixed use_pretrained issue --- .../models/conditional_detr/configuration_conditional_detr.py | 4 ++++ .../models/conditional_detr/modeling_conditional_detr.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 6abcf85894735..df6673e9ebfef 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -134,6 +134,7 @@ class ConditionalDetrConfig(PretrainedConfig): def __init__( self, + num_channels=3, num_queries=300, max_position_embeddings=1024, encoder_layers=6, @@ -157,6 +158,7 @@ def __init__( auxiliary_loss=False, position_embedding_type="sine", backbone="resnet50", + use_pretrained_backbone=True, dilation=False, class_cost=2, bbox_cost=5, @@ -169,6 +171,7 @@ def __init__( focal_alpha=0.25, **kwargs ): + self.num_channels = num_channels self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -191,6 +194,7 @@ def __init__( self.auxiliary_loss = auxiliary_loss self.position_embedding_type = position_embedding_type self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.dilation = dilation # Hungarian matcher self.class_cost = class_cost diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a24a306b65e9f..9e70e1b873cd6 100644 --- 
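# The NestedTensor / nested_tensor_from_tensor_list helpers above batch images of
# different sizes by padding to the largest height and width while tracking which pixels
# are real. A rough equivalent of what pad_and_create_pixel_mask produces (pixel_mask is
# 1 on real pixels, 0 on padding); the image sizes below are arbitrary.
import torch

images = [torch.rand(3, 480, 640), torch.rand(3, 576, 608)]
max_height = max(image.shape[1] for image in images)
max_width = max(image.shape[2] for image in images)
pixel_values = torch.zeros(len(images), 3, max_height, max_width)
pixel_mask = torch.zeros(len(images), max_height, max_width, dtype=torch.long)
for i, image in enumerate(images):
    _, height, width = image.shape
    pixel_values[i, :, :height, :width] = image
    pixel_mask[i, :height, :width] = 1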
a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1452,7 +1452,9 @@ def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # Create backbone + positional encoding - backbone = ConditionalDetrTimmConvEncoder(config.backbone, config.dilation) + backbone = ConditionalDetrTimmConvEncoder( + config.backbone, config.dilation, config.use_pretrained_backbone, config.num_channels + ) position_embeddings = build_position_encoding(config) self.backbone = ConditionalDetrConvModel(backbone, position_embeddings) From 3f2cd47cca4867b951d1fc8212c0cd6c62b90749 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 16 Sep 2022 10:35:52 -0400 Subject: [PATCH 084/233] changed post-process --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 922cfaf09e98e..be85334d7b354 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -701,7 +701,7 @@ def post_process(self, outputs, target_sizes): raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1) scores = topk_values topk_boxes = topk_indexes // out_logits.shape[2] labels = topk_indexes % out_logits.shape[2] From 0bfc87aaced33ac35832d5af0aae0515bee96780 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 19 Sep 2022 11:05:44 -0400 Subject: [PATCH 085/233] fix style quality and copies --- src/transformers/__init__.py | 12 ++--- src/transformers/models/auto/modeling_auto.py | 2 +- .../modeling_conditional_detr.py | 44 ++++++++++--------- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index af17f29e0ded3..6abc53c85008e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3540,12 +3540,6 @@ except OptionalDependencyNotAvailable: from .utils.dummy_timm_and_vision_objects import * else: - from .models.deformable_detr import ( - DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - DeformableDetrForObjectDetection, - DeformableDetrModel, - DeformableDetrPreTrainedModel, - ) from .models.conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, ConditionalDetrForObjectDetection, @@ -3553,6 +3547,12 @@ ConditionalDetrModel, ConditionalDetrPreTrainedModel, ) + from .models.deformable_detr import ( + DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + DeformableDetrForObjectDetection, + DeformableDetrModel, + DeformableDetrPreTrainedModel, + ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, DetrForObjectDetection, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 072805d6d74b2..f9152bc6f7822 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -457,8 +457,8 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Object Detection mapping - ("deformable_detr", 
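# Hypothetical usage of the two configuration fields wired through above (num_channels and
# use_pretrained_backbone): turning the flag off builds the timm backbone with random
# weights, which is handy for from-scratch training or lightweight tests.
from transformers import ConditionalDetrConfig, ConditionalDetrModel

config = ConditionalDetrConfig(use_pretrained_backbone=False, num_channels=3)
model = ConditionalDetrModel(config)   # no backbone weights are downloaded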
"DeformableDetrForObjectDetection"), ("conditional_detr", "ConditionalDETRForObjectDetection"), + ("deformable_detr", "DeformableDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 9e70e1b873cd6..a8d272f312cd2 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -280,7 +280,7 @@ class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ def __init__(self, n): - super(ConditionalDetrFrozenBatchNorm2d, self).__init__() + super().__init__() self.register_buffer("weight", torch.ones(n)) self.register_buffer("bias", torch.zeros(n)) self.register_buffer("running_mean", torch.zeros(n)) @@ -293,7 +293,7 @@ def _load_from_state_dict( if num_batches_tracked_key in state_dict: del state_dict[num_batches_tracked_key] - super(ConditionalDetrFrozenBatchNorm2d, self)._load_from_state_dict( + super()._load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) @@ -398,14 +398,14 @@ def forward(self, pixel_values, pixel_mask): # Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDetr -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len + batch_size, source_len = mask.size() + target_len = target_len if target_len is not None else source_len - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) inverted_mask = 1.0 - expanded_mask @@ -462,12 +462,12 @@ def __init__(self, embedding_dim=256): self.column_embeddings = nn.Embedding(50, embedding_dim) def forward(self, pixel_values, pixel_mask=None): - h, w = pixel_values.shape[-2:] - i = torch.arange(w, device=pixel_values.device) - j = torch.arange(h, device=pixel_values.device) - x_emb = self.column_embeddings(i) - y_emb = self.row_embeddings(j) - pos = torch.cat([x_emb.unsqueeze(0).repeat(h, 1, 1), y_emb.unsqueeze(1).repeat(1, w, 1)], dim=-1) + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) pos = pos.permute(2, 0, 1) pos = pos.unsqueeze(0) pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) @@ -2538,15 +2538,17 @@ def generalized_box_iou(boxes1, boxes2): """ # degenerate boxes gives inf / nan results # so do an early check - assert (boxes1[:, 2:] >= boxes1[:, :2]).all() - assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise 
ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") iou, union = box_iou(boxes1, boxes2) - lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - wh = (rb - lt).clamp(min=0) # [N,M,2] - area = wh[:, :, 0] * wh[:, :, 1] + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] return iou - (area - union) / area @@ -2590,11 +2592,11 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: max_size = _max_by_axis([list(img.shape) for img in tensor_list]) batch_shape = [len(tensor_list)] + max_size - b, c, h, w = batch_shape + batch_size, num_channels, height, width = batch_shape dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) for img, pad_img, m in zip(tensor_list, tensor, mask): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) m[: img.shape[1], : img.shape[2]] = False From 3d3d78f3f8cf9315fa5d367e03f357e1b21c3427 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 19 Sep 2022 11:15:45 -0400 Subject: [PATCH 086/233] fix style quality and copies --- src/transformers/models/auto/modeling_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index f9152bc6f7822..8ee36125de25d 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -457,7 +457,7 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Object Detection mapping - ("conditional_detr", "ConditionalDETRForObjectDetection"), + ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), From 8273131b2dc8b8955ce701f77f0aebaca9639d72 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 19 Sep 2022 14:15:53 -0400 Subject: [PATCH 087/233] fix style quality and copies --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a8d272f312cd2..91c1a71d6aa82 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1330,7 +1330,7 @@ def forward( # expand encoder attention mask if encoder_hidden_states is not None and encoder_attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, target_len=input_shape[-1]) # optional intermediate hidden states intermediate = () if self.config.auxiliary_loss else None From ce0e01fdf4603a8eba09ee4573179477d449a4c0 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 19 Sep 2022 14:22:24 -0400 Subject: [PATCH 088/233] fix style quality and 
copies --- .../models/conditional_detr/modeling_conditional_detr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 91c1a71d6aa82..20aa578737d70 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1330,7 +1330,9 @@ def forward( # expand encoder attention mask if encoder_hidden_states is not None and encoder_attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, target_len=input_shape[-1]) + encoder_attention_mask = _expand_mask( + encoder_attention_mask, inputs_embeds.dtype, target_len=input_shape[-1] + ) # optional intermediate hidden states intermediate = () if self.config.auxiliary_loss else None From 81788ee23cc15bdb95ee7d417633ed73c0703e16 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 10:18:01 -0400 Subject: [PATCH 089/233] add more fix-copies --- .../conditional_detr/feature_extraction_conditional_detr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index be85334d7b354..dedaa5ccda3df 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -211,9 +211,10 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->ConditionalDETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by Conditional DETR. + Convert the target in COCO format into the format expected by ConditionalDETR. """ w, h = image.size From 7bc57201ebb03501452cbebd28ab8f7a0adb56a1 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 20 Sep 2022 10:36:47 -0400 Subject: [PATCH 090/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index dedaa5ccda3df..ad0ad138c4bb8 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -122,7 +122,7 @@ def id_to_rgb(id_map): class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" - Constructs a CONDITIONAL DETR feature extractor. + Constructs a Conditional DETR feature extractor. This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. 
From acbaba0d14d8c47252752f71c5c5f55e13fcfac5 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 11:36:53 -0400 Subject: [PATCH 091/233] fixed some variable names & added more fix-copies --- src/transformers/models/auto/modeling_auto.py | 1 - .../feature_extraction_conditional_detr.py | 10 +++++- .../modeling_conditional_detr.py | 26 +++++++------- ...est_feature_extraction_conditional_detr.py | 36 +++++++++---------- .../test_modeling_conditional_detr.py | 2 +- 5 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 8ee36125de25d..d7c5f1772f13b 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -374,7 +374,6 @@ [ # Do not add new models here, this class will be deprecated in the future. # Model for Image Segmentation mapping - ("conditional_detr", "ConditionalDetrForSegmentation"), ("detr", "DetrForSegmentation"), ] ) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index dedaa5ccda3df..f625a034d294b 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -152,6 +152,8 @@ class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtrac model_input_names = ["pixel_values", "pixel_mask"] + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__init__ def __init__( self, format="coco_detection", @@ -172,11 +174,13 @@ def __init__( self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._is_valid_format def _is_valid_format(self, format): if format not in ["coco_detection", "coco_panoptic"]: raise ValueError(f"Format {format} not supported") return format + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): if self.format == "coco_detection": image, target = self.prepare_coco_detection(image, target, return_segmentation_masks) @@ -310,6 +314,7 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,6 +385,7 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. 
@@ -402,6 +408,7 @@ def _normalize(self, image, mean, std, target=None): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__call__ with Detr->ConditionalDetr,DETR->ConditionalDETR def __call__( self, images: ImageInput, @@ -549,7 +556,7 @@ def __call__( if annotations is not None: annotations = [annotations] - # prepare (COCO annotations as a list of Dict -> CONDITIONAL DETR target as a single Dict per image) + # prepare (COCO annotations as a list of Dict -> ConditionalDETR target as a single Dict per image) if annotations is not None: for idx, (image, target) in enumerate(zip(images, annotations)): if not isinstance(image, Image.Image): @@ -624,6 +631,7 @@ def __call__( return encoded_inputs + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 20aa578737d70..188e1ab8f5dde 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -911,8 +911,7 @@ def forward( k_pos = self.sa_kpos_proj(query_position_embeddings) v = self.sa_v_proj(hidden_states) - bs, num_queries, n_model = q_content.shape - _, hw, _ = k_content.shape + _, num_queries, n_model = q_content.shape q = q_content + q_pos k = k_content + k_pos @@ -936,8 +935,8 @@ def forward( k_content = self.ca_kcontent_proj(encoder_hidden_states) v = self.ca_v_proj(encoder_hidden_states) - bs, num_queries, n_model = q_content.shape - _, hw, _ = k_content.shape + batch_size, num_queries, n_model = q_content.shape + _, src_len, _ = k_content.shape k_pos = self.ca_kpos_proj(position_embeddings) @@ -951,13 +950,13 @@ def forward( q = q_content k = k_content - q = q.view(bs, num_queries, self.nhead, n_model // self.nhead) + q = q.view(batch_size, num_queries, self.nhead, n_model // self.nhead) query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) - query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model // self.nhead) - q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) - k = k.view(bs, hw, self.nhead, n_model // self.nhead) - k_pos = k_pos.view(bs, hw, self.nhead, n_model // self.nhead) - k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) + query_sine_embed = query_sine_embed.view(batch_size, num_queries, self.nhead, n_model // self.nhead) + q = torch.cat([q, query_sine_embed], dim=3).view(batch_size, num_queries, n_model * 2) + k = k.view(batch_size, src_len, self.nhead, n_model // self.nhead) + k_pos = k_pos.view(batch_size, src_len, self.nhead, n_model // self.nhead) + k = torch.cat([k, k_pos], dim=3).view(batch_size, src_len, n_model * 2) # Cross-Attention Block cross_attn_weights = None @@ -1118,6 +1117,7 @@ def _set_gradient_checkpointing(self, module, value=False): """ +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with Detr->ConditionalDetr,DETR->ConditionalDETR class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -1125,7 +1125,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): The encoder updates the flattened feature map through multiple self-attention layers. 
- Small tweak for Conditional DETR: + Small tweak for ConditionalDETR: - position_embeddings are added to the forward pass. @@ -1141,7 +1141,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) - # in the original Conditional DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + # in the original ConditionalDETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default # Initialize weights and apply final processing self.post_init() @@ -1191,7 +1191,7 @@ def forward( # expand attention_mask if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) encoder_states = () if output_hidden_states else None diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index 371d97de0000b..1e99c513f3524 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -240,8 +240,8 @@ def test_equivalence_pad_and_create_pixel_mask(self): encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") - assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + self.assertTrue(torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)) + self.assertTrue(torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4)) @slow def test_call_pytorch_with_coco_detection_annotations(self): @@ -261,31 +261,31 @@ def test_call_pytorch_with_coco_detection_annotations(self): self.assertEqual(encoding["pixel_values"].shape, expected_shape) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) # verify area expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) - assert torch.allclose(encoding["labels"][0]["area"], expected_area) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) # verify boxes expected_boxes_shape = torch.Size([6, 4]) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) # verify image_id expected_image_id = torch.tensor([39769]) - assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) # verify is_crowd expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - assert torch.allclose(encoding["labels"][0]["iscrowd"], 
expected_is_crowd) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) # verify class_labels expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) - assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) # verify orig_size expected_orig_size = torch.tensor([480, 640]) - assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) # verify size expected_size = torch.tensor([800, 1066]) - assert torch.allclose(encoding["labels"][0]["size"], expected_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) @slow def test_call_pytorch_with_coco_panoptic_annotations(self): @@ -308,31 +308,31 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): self.assertEqual(encoding["pixel_values"].shape, expected_shape) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) # verify area expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) - assert torch.allclose(encoding["labels"][0]["area"], expected_area) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) # verify boxes expected_boxes_shape = torch.Size([6, 4]) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) # verify image_id expected_image_id = torch.tensor([39769]) - assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) # verify is_crowd expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) # verify class_labels expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) - assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) # verify masks expected_masks_sum = 822338 self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) # verify orig_size expected_orig_size = torch.tensor([480, 640]) - assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) # verify size expected_size = torch.tensor([800, 1066]) - assert torch.allclose(encoding["labels"][0]["size"], expected_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index a67f0bc30638e..2c7be4d8f2b56 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ 
b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -472,7 +472,7 @@ def test_inference_no_head(self): outputs = model(**encoding) expected_shape = torch.Size((1, 300, 256)) - assert outputs.last_hidden_state.shape == expected_shape + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] ).to(torch_device) From 8d5498c7d9602ee2b51f0ab00607521c6fbb2aa9 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 11:41:00 -0400 Subject: [PATCH 092/233] fixed some variable names & added more fix-copies --- .../feature_extraction_conditional_detr.py | 1 - .../test_feature_extraction_conditional_detr.py | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f625a034d294b..74cee04a72c7f 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -152,7 +152,6 @@ class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtrac model_input_names = ["pixel_values", "pixel_mask"] - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__init__ def __init__( self, diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index 1e99c513f3524..ddf7e66ef72fe 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -240,8 +240,12 @@ def test_equivalence_pad_and_create_pixel_mask(self): encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") - self.assertTrue(torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)) - self.assertTrue(torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4)) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + ) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + ) @slow def test_call_pytorch_with_coco_detection_annotations(self): From 7a6b08484236263f0ebf64fc3d24e694647d785b Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 20 Sep 2022 14:14:02 -0400 Subject: [PATCH 093/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index df6673e9ebfef..2890fd2be39d7 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,8 +27,8 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { 
- "microsoft/conditional_detr_resnet50": ( - "https://huggingface.co/microsoft/conditional_detr_resnet50/resolve/main/config.json" + "microsoft/conditional-detr-resnet-50": ( + "https://huggingface.co/microsoft/conditional_detr_resnet-50/resolve/main/config.json" ), } From cbfea52f72c9b9e6d0fc7bc038d6c040e59e494a Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 14:18:53 -0400 Subject: [PATCH 094/233] added more copied from --- .../conditional_detr/modeling_conditional_detr.py | 11 +++++++++-- utils/check_repo.py | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 188e1ab8f5dde..92044c4d30e02 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1013,8 +1013,15 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with DetrMLPPredictionHead->MLP class MLP(nn.Module): - """Very simple multi-layer perceptron (also called FFN)""" + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() @@ -1024,7 +1031,7 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): def forward(self, x): for i, layer in enumerate(self.layers): - x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x diff --git a/utils/check_repo.py b/utils/check_repo.py index 8659f795602e2..988967e797d12 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -167,6 +167,7 @@ "FlaxCLIPVisionModel", "FlaxWav2Vec2ForCTC", "DetrForSegmentation", + "ConditionalDetrForSegmentation", "DPRReader", "FlaubertForQuestionAnswering", "FlavaImageCodebook", From 21235962928cd0e33bdcb5bba787df6bad61353d Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 14:25:39 -0400 Subject: [PATCH 095/233] fixed quality --- .../models/conditional_detr/modeling_conditional_detr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 92044c4d30e02..c18a4aaee24bd 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -22,7 +22,6 @@ import torch from torch import Tensor, nn -from torch.nn import functional as F from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput From 4c1b817f77ffa4be06aaa70d028fe9e75a2521ab Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 14:31:16 -0400 Subject: [PATCH 096/233] changed pretrained config --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 2890fd2be39d7..afa6426bc3738 
100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -28,7 +28,7 @@ CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { "microsoft/conditional-detr-resnet-50": ( - "https://huggingface.co/microsoft/conditional_detr_resnet-50/resolve/main/config.json" + "https://huggingface.co/microsoft/conditional-detr-resnet-50/resolve/main/config.json" ), } From f8bd9f03ad46c21bad4bd9ddb17133d53ecf8f23 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Wed, 21 Sep 2022 10:25:57 -0400 Subject: [PATCH 097/233] added more copied-from and fixed the issue in feature_extraction_auto --- .../models/auto/feature_extraction_auto.py | 2 +- .../modeling_conditional_detr.py | 61 +++++++++++-------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 015fd132ef0dc..cb75f439c233d 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,7 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("conditional_detr", "ConditionalDetrFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), @@ -46,7 +47,6 @@ ("deformable_detr", "DetrFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), - ("detr", "DetrFeatureExtractor"), ("donut", "DonutFeatureExtractor"), ("dpt", "DPTFeatureExtractor"), ("flava", "FlavaFeatureExtractor"), diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index c18a4aaee24bd..79199ce06e428 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -509,11 +509,12 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) +# Copied from transformers.models.detr.modeling_detr.DetrAttention class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the Conditional DETR paper). + Here, we add position embeddings to the queries and keys (as explained in the DETR paper). 
""" def __init__( @@ -541,8 +542,8 @@ def __init__( self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): return tensor if position_embeddings is None else tensor + position_embeddings @@ -561,7 +562,7 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + batch_size, target_len, embed_dim = hidden_states.size() # add position embeddings to the hidden states before projecting to queries and keys if position_embeddings is not None: @@ -578,35 +579,36 @@ def forward( # get key, value proj if is_cross_attention: # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states_original), -1, bsz) + key_states = self._shape(self.k_proj(key_value_states), -1, batch_size) + value_states = self._shape(self.v_proj(key_value_states_original), -1, batch_size) else: # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states_original), -1, bsz) + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) key_states = key_states.view(*proj_shape) value_states = value_states.view(*proj_shape) - src_len = key_states.size(1) + source_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" f" {attn_weights.size()}" ) if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): + if attention_mask.size() != (batch_size, 1, target_len, source_len): raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) attn_weights = nn.functional.softmax(attn_weights, dim=-1) @@ -615,8 +617,8 @@ def forward( # make sure 
that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) else: attn_weights_reshaped = None @@ -624,15 +626,15 @@ def forward( attn_output = torch.bmm(attn_probs, value_states) - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" f" {attn_output.size()}" ) - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) attn_output = self.out_proj(attn_output) @@ -755,6 +757,7 @@ def forward( return attn_output, attn_weights_reshaped +# Copied from transformers.models.detr.modeling_detr.DetrEncoderLayer with DetrEncoderLayer->ConditionalDetrEncoderLayer,DetrConfig->ConditionalDetrConfig class ConditionalDetrEncoderLayer(nn.Module): def __init__(self, config: ConditionalDetrConfig): super().__init__() @@ -783,7 +786,8 @@ def forward( Args: hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. position_embeddings (`torch.FloatTensor`, *optional*): position embeddings, to be added to hidden_states. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. 
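The renames in this hunk (`bsz` to `batch_size`, `tgt_len` to `target_len`, `src_len` to `source_len`) only touch naming; the shape contract checked by the error messages stays the same. A small self-contained sketch of that contract, with dummy tensors standing in for the already projected queries, keys and values (dimensions are made up):

```python
import torch

# Assumed toy dimensions for illustration only.
batch_size, num_heads, target_len, source_len, head_dim = 2, 8, 300, 950, 32

query_states = torch.randn(batch_size * num_heads, target_len, head_dim)
key_states = torch.randn(batch_size * num_heads, source_len, head_dim)
value_states = torch.randn(batch_size * num_heads, source_len, head_dim)

# Batched matmul over the flattened (batch * heads) dimension,
# matching the size checks in the diff above.
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
assert attn_weights.size() == (batch_size * num_heads, target_len, source_len)

attn_probs = torch.nn.functional.softmax(attn_weights, dim=-1)
attn_output = torch.bmm(attn_probs, value_states)

# Fold the heads back into the embedding dimension.
attn_output = (
    attn_output.view(batch_size, num_heads, target_len, head_dim)
    .transpose(1, 2)
    .reshape(batch_size, target_len, num_heads * head_dim)
)
print(attn_output.shape)  # torch.Size([2, 300, 256])
```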
See `attentions` under @@ -1520,9 +1524,18 @@ def forward( >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") >>> model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50") + + >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") + + >>> # forward pass >>> outputs = model(**inputs) + + >>> # the last hidden states are the final query embeddings of the Transformer decoder + >>> # these are of shape (batch_size, num_queries, hidden_size) >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From bbb01a60e4b6499018b5ff05d57565899c7d29be Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:47:29 -0400 Subject: [PATCH 098/233] added conditional_detr files --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/conditional_detr.mdx | 53 + docs/source/en/serialization.mdx | 1 + src/transformers/__init__.py | 20 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/conditional_detr/__init__.py | 87 + .../configuration_conditional_detr.py | 242 ++ ..._original_pytorch_checkpoint_to_pytorch.py | 298 ++ .../feature_extraction_conditional_detr.py | 938 ++++++ .../modeling_conditional_detr.py | 2547 +++++++++++++++++ .../utils/dummy_vision_objects.py | 7 + tests/models/conditional_detr/__init__.py | 0 ...est_feature_extraction_conditional_detr.py | 338 +++ .../test_modeling_conditional_detr.py | 538 ++++ 21 files changed, 5084 insertions(+) create mode 100644 docs/source/en/model_doc/conditional_detr.mdx create mode 100644 src/transformers/models/conditional_detr/__init__.py create mode 100644 src/transformers/models/conditional_detr/configuration_conditional_detr.py create mode 100644 src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py create mode 100644 src/transformers/models/conditional_detr/modeling_conditional_detr.py create mode 100644 tests/models/conditional_detr/__init__.py create mode 100644 tests/models/conditional_detr/test_feature_extraction_conditional_detr.py create mode 100644 tests/models/conditional_detr/test_modeling_conditional_detr.py diff --git a/README.md b/README.md index 31e4c5af04567..a481458cde19e 100644 --- a/README.md +++ b/README.md @@ -278,6 +278,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_ko.md b/README_ko.md index a4ccc124acc2f..de1f2b3e13560 100644 --- a/README_ko.md +++ b/README_ko.md @@ -228,6 +228,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. 
**[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_zh-hans.md b/README_zh-hans.md index 34839f54a3a80..f0ffb27c4101f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -252,6 +252,7 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang发布。 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 9e6a4a0b0ecfe..03687087f7a22 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -264,6 +264,7 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index f118359bc57be..7830c57e6eabd 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,6 +68,7 @@ The documentation is organized into five sections: 1. 
**[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. @@ -215,6 +216,7 @@ Flax), PyTorch, and/or TensorFlow. | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | +| conditional_detr | ❌ | ❌ | ✅ | ❌ | ❌ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx new file mode 100644 index 0000000000000..51316a24e43ee --- /dev/null +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -0,0 +1,53 @@ + + +# Conditional DETR + +## Overview + +The conditional DETR model was proposed in [ConditionalDETR](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. + +The abstract from the paper is the following: + +*The recently-developed DETR approach applies the transformer encoder and decoder architecture to object detection and achieves promising performance. In this paper, we handle the critical issue, slow training convergence, and present a conditional cross-attention mechanism for fast DETR training. Our approach is motivated by that the cross-attention in DETR relies highly on the content embeddings for localizing the four extremities and predicting the box, which increases the need for high-quality content embeddings and thus the training difficulty. 
Our approach, named conditional DETR, learns a conditional spatial query from the decoder embedding for decoder multi-head cross-attention. The benefit is that through the conditional spatial query, each cross-attention head is able to attend to a band containing a distinct region, e.g., one object extremity or a region inside the object box. This narrows down the spatial range for localizing the distinct regions for object classification and box regression, thus relaxing the dependence on the content embeddings and easing the training. Empirical results show that conditional DETR converges 6.7× faster for the backbones R50 and R101 and 10× faster for stronger backbones DC5-R50 and DC5-R101. Code is available at https://github.com/Atten4Vis/ConditionalDETR.* + + +This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The original code can be found [here](https://github.com/Atten4Vis/ConditionalDETR). + + +## ConditionalDETRConfig + +[[autodoc]] ConditionalDETRConfig + +## ConditionalDETRFeatureExtractor + +[[autodoc]] ConditionalDETRFeatureExtractor + - __call__ + - pad_and_create_pixel_mask + - post_process + - post_process_segmentation + - post_process_panoptic + +## ConditionalDETRModel + +[[autodoc]] ConditionalDETRModel + - forward + +## ConditionalDETRForObjectDetection + +[[autodoc]] ConditionalDETRForObjectDetection + - forward + +## ConditionalDETRForSegmentation + +[[autodoc]] ConditionalDETRForSegmentation + - forward \ No newline at end of file diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 74f50c78513ce..f6304004b62ae 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -57,6 +57,7 @@ Ready-made configurations include the following architectures: - CamemBERT - CLIP - CodeGen +- conditional_detr - ConvBERT - ConvNeXT - Data2VecText diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3c3a3a5006416..b96724b992093 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -172,6 +172,7 @@ "CLIPVisionConfig", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], + "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDETRConfig"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.cpm": [], @@ -660,6 +661,7 @@ _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") + _import_structure["models.conditional_detr"].append("ConditionalDETRFeatureExtractor") _import_structure["models.donut"].append("DonutFeatureExtractor") _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaProcessor"]) @@ -708,6 +710,15 @@ "DetrPreTrainedModel", ] ) + _import_structure["models.conditional_detr"].extend( + [ + "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", + "ConditionalDETRForObjectDetection", + "ConditionalDETRForSegmentation", + "ConditionalDETRModel", + "ConditionalDETRPreTrainedModel", + ] + ) try: if not is_scatter_available(): @@ -3075,6 +3086,7 @@ CLIPVisionConfig, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer + from .models.conditional_detr import 
CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDETRConfig from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer @@ -3498,6 +3510,7 @@ from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor + from .models.conditional_detr import ConditionalDETRFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor @@ -3527,6 +3540,13 @@ except OptionalDependencyNotAvailable: from .utils.dummy_timm_and_vision_objects import * else: + from .models.conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ConditionalDETRModel, + ConditionalDETRPreTrainedModel, + ) from .models.deformable_detr import ( DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, DeformableDetrForObjectDetection, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index fbdbfd579cb9e..8a6622a9f35b6 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -38,6 +38,7 @@ canine, clip, codegen, + conditional_detr, convbert, convnext, cpm, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 1204e6608a768..b28bb0665bc3d 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -43,6 +43,7 @@ ("canine", "CanineConfig"), ("clip", "CLIPConfig"), ("codegen", "CodeGenConfig"), + ("conditional_detr", "ConditionalDETRConfig"), ("convbert", "ConvBertConfig"), ("convnext", "ConvNextConfig"), ("ctrl", "CTRLConfig"), @@ -175,6 +176,7 @@ ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("codegen", "CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("conditional_detr", "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convbert", "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convnext", "CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ctrl", "CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -300,6 +302,7 @@ ("canine", "CANINE"), ("clip", "CLIP"), ("codegen", "CodeGen"), + ("conditional_detr", "conditional_detr"), ("convbert", "ConvBERT"), ("convnext", "ConvNeXT"), ("cpm", "CPM"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 015fd132ef0dc..9ba8295b406da 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,8 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 7f4968d03cdf6..322a48f1cefeb 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -42,6 +42,7 @@ ("canine", "CanineModel"), ("clip", "CLIPModel"), 
("codegen", "CodeGenModel"), + ("conditional_detr", "ConditionalDETRModel"), ("convbert", "ConvBertModel"), ("convnext", "ConvNextModel"), ("ctrl", "CTRLModel"), @@ -373,6 +374,7 @@ [ # Do not add new models here, this class will be deprecated in the future. # Model for Image Segmentation mapping + ("conditional_detr", "ConditionalDETRForSegmentation"), ("detr", "DetrForSegmentation"), ] ) @@ -455,6 +457,7 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Object Detection mapping + ("conditional_detr", "ConditionalDETRForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), diff --git a/src/transformers/models/conditional_detr/__init__.py b/src/transformers/models/conditional_detr/__init__.py new file mode 100644 index 0000000000000..1b72e7d7a3238 --- /dev/null +++ b/src/transformers/models/conditional_detr/__init__.py @@ -0,0 +1,87 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_timm_available, is_vision_available + + +_import_structure = { + "configuration_conditional_detr": [ + "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", + "ConditionalDETRConfig", + "ConditionalDETROnnxConfig", + ] +} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDETRFeatureExtractor"] + +try: + if not is_timm_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_conditional_detr"] = [ + "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", + "ConditionalDETRForObjectDetection", + "ConditionalDETRForSegmentation", + "ConditionalDETRModel", + "ConditionalDETRPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, + ConditionalDETRConfig, + ConditionalDETROnnxConfig, + ) + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_conditional_detr import ConditionalDETRFeatureExtractor + + try: + if not is_timm_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ConditionalDETRModel, + ConditionalDETRPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], 
_import_structure, module_spec=__spec__) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py new file mode 100644 index 0000000000000..ce004dc6fc15c --- /dev/null +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -0,0 +1,242 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" CONDITIONAL_DETR model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "DeppMeng/ConditionalDETR": "https://huggingface.co/DeppMeng/ConditionalDETR/resolve/main/config.json", +} + + +class ConditionalDETRConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate + a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR + [DeppMeng/ConditionalDETR](https://huggingface.co/DeppMeng/ConditionalDETR) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 100): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`ConditionalDETRModel`] can detect in a single image. For COCO, we recommend 100 queries. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. 
+ dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a + list of all available models, see [this + page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. 
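The `class_cost`, `bbox_cost` and `giou_cost` arguments above weight the three terms of the Hungarian matching cost that assigns predictions to ground-truth boxes. The toy sketch below only illustrates how such a weighted cost matrix is assembled and solved; the random numbers stand in for the real focal-style classification term, pairwise L1 box distances and negative generalized IoU.

```python
import torch
from scipy.optimize import linear_sum_assignment

# Weights as described above (illustrative values).
class_cost, bbox_cost, giou_cost = 2.0, 5.0, 2.0

cost_class = torch.rand(4, 3)   # (num_queries=4, num_targets=3) classification cost
cost_bbox = torch.rand(4, 3)    # pairwise L1 distance between predicted and target boxes
cost_giou = -torch.rand(4, 3)   # negative generalized IoU (higher overlap -> lower cost)

cost_matrix = class_cost * cost_class + bbox_cost * cost_bbox + giou_cost * cost_giou
row_ind, col_ind = linear_sum_assignment(cost_matrix.numpy())
print(list(zip(row_ind.tolist(), col_ind.tolist())))  # query -> target assignment
```

Once the assignment is fixed, the box terms reappear in the training loss weighted by `bbox_loss_coefficient` and `giou_loss_coefficient` as documented above.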
+ + Examples: + + ```python + >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig + + >>> # Initializing a CONDITIONAL_DETR DeppMeng/ConditionalDETR style configuration + >>> configuration = ConditionalDETRConfig() + + >>> # Initializing a model from the DeppMeng/ConditionalDETR style configuration + >>> model = ConditionalDETRModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "conditional_detr" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + num_queries=300, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + classifier_dropout=0.0, + scale_embedding=False, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + dilation=False, + class_cost=2, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + cls_loss_coefficient=2, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + focal_alpha=0.25, + **kwargs + ): + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.dilation = dilation + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.cls_loss_coefficient = cls_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.focal_alpha = focal_alpha + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + +class ConditionalDETROnnxConfig(OnnxConfig): + + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + 
("pixel_mask", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-5 + + @property + def default_onnx_opset(self) -> int: + return 12 diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000..2d7abc3c088b5 --- /dev/null +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,298 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert CONDITIONAL_DETR checkpoints.""" + + +import argparse +import json +from collections import OrderedDict +from pathlib import Path + +import torch +from PIL import Image + +import requests +from huggingface_hub import hf_hub_download +from transformers import ( + ConditionalDETRConfig, + ConditionalDETRFeatureExtractor, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +# here we list all keys to be renamed (original name on the left, our name on the right) +rename_keys = [] +for i in range(6): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) + # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append( + ( + 
f"transformer.decoder.layers.{i}.cross_attn.out_proj.weight", + f"decoder.layers.{i}.encoder_attn.out_proj.weight", + ) + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.cross_attn.out_proj.bias", + f"decoder.layers.{i}.encoder_attn.out_proj.bias", + ) + ) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) + + # q, k, v projections in self/cross-attention in decoder for conditional DETR + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight")) + # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight")) + + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", 
f"decoder.layers.{i}.ca_qcontent_proj.bias")) + # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias")) + +# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads +# for conditional DETR, also convert reference point head and query scale MLP +rename_keys.extend( + [ + ("input_proj.weight", "input_projection.weight"), + ("input_proj.bias", "input_projection.bias"), + ("query_embed.weight", "query_position_embeddings.weight"), + ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), + ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), + ("class_embed.weight", "class_labels_classifier.weight"), + ("class_embed.bias", "class_labels_classifier.bias"), + ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), + ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), + ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), + ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), + ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), + ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), + ("transformer.decoder.ref_point_head.layers.0.weight", "decoder.ref_point_head.layers.0.weight"), + ("transformer.decoder.ref_point_head.layers.0.bias", "decoder.ref_point_head.layers.0.bias"), + ("transformer.decoder.ref_point_head.layers.1.weight", "decoder.ref_point_head.layers.1.weight"), + ("transformer.decoder.ref_point_head.layers.1.bias", "decoder.ref_point_head.layers.1.bias"), + ("transformer.decoder.query_scale.layers.0.weight", "decoder.query_scale.layers.0.weight"), + ("transformer.decoder.query_scale.layers.0.bias", "decoder.query_scale.layers.0.bias"), + ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), + ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), + ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), + ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias") + ] +) + + +def rename_key(state_dict, old, new): + val = state_dict.pop(old) + state_dict[new] = val + + +def rename_backbone_keys(state_dict): + new_state_dict = OrderedDict() + for key, value in state_dict.items(): + if "backbone.0.body" in key: + new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") + new_state_dict[new_key] = value + else: + new_state_dict[key] = value + + return new_state_dict + + +def read_in_q_k_v(state_dict, is_panoptic=False): + prefix = "" + if is_panoptic: + prefix = "conditional_detr." 
+ + # first: transformer encoder + for i in range(6): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our CONDITIONAL_DETR structure. + """ + + # load default config + config = ConditionalDETRConfig() + # set backbone and dilation attributes + if "resnet101" in model_name: + config.backbone = "resnet101" + if "dc5" in model_name: + config.dilation = True + is_panoptic = "panoptic" in model_name + if is_panoptic: + config.num_labels = 250 + else: + config.num_labels = 91 + repo_id = "datasets/huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load feature extractor + format = "coco_panoptic" if is_panoptic else "coco_detection" + feature_extractor = ConditionalDETRFeatureExtractor(format=format) + + # prepare image + img = prepare_img() + encoding = feature_extractor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info(f"Converting model {model_name}...") + + # load original model from torch hub + conditional_detr = torch.hub.load("DeppMeng/ConditionalDETR", model_name, pretrained=True).eval() + state_dict = conditional_detr.state_dict() + # rename keys + for src, dest in rename_keys: + if is_panoptic: + src = "conditional_detr." + src + rename_key(state_dict, src, dest) + state_dict = rename_backbone_keys(state_dict) + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, is_panoptic=is_panoptic) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "conditional_detr.model." if is_panoptic else "model." + for key in state_dict.copy().keys(): + if is_panoptic: + if ( + key.startswith("conditional_detr") + and not key.startswith("class_labels_classifier") + and not key.startswith("bbox_predictor") + ): + val = state_dict.pop(key) + state_dict["conditional_detr.model" + key[4:]] = val + elif "class_labels_classifier" in key or "bbox_predictor" in key: + val = state_dict.pop(key) + state_dict["conditional_detr." 
+ key] = val + elif key.startswith("bbox_attention") or key.startswith("mask_head"): + continue + else: + val = state_dict.pop(key) + state_dict[prefix + key] = val + else: + if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = ConditionalDETRForSegmentation(config) if is_panoptic else ConditionalDETRForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + # verify our conversion + original_outputs = conditional_detr(pixel_values) + outputs = model(pixel_values) + assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) + assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) + if is_panoptic: + assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) + + # Save model and feature extractor + logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name", + default="conditional_detr_resnet50", + type=str, + help="Name of the CONDITIONAL_DETR model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + args = parser.parse_args() + convert_conditional_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py new file mode 100644 index 0000000000000..07550124c157e --- /dev/null +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -0,0 +1,938 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Feature extractor class for CONDITIONAL_DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Dict, List, Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor +from ...utils import TensorType, is_torch_available, logging + + +if is_torch_available(): + import torch + from torch import nn + +logger = logging.get_logger(__name__) + + +ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] + + +# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +def center_to_corners_format(x): + """ + Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format + (x_0, y_0, x_1, y_1). + """ + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +def corners_to_center_format(x): + """ + Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1, + y_1) to center format (center_x, center_y, width, height). + """ + x_transposed = x.T + x0, y0, x1, y1 = x_transposed[0], x_transposed[1], x_transposed[2], x_transposed[3] + b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] + return np.stack(b, axis=-1) + + +def masks_to_boxes(masks): + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + + Returns a [N, 4] tensor, with the boxes in corner (xyxy) format. + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. 
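+# rgb_to_id packs an RGB color into a single integer id (R + 256 * G + 256**2 * B), the encoding used in COCO
+# panoptic PNG annotations; id_to_rgb below is the inverse mapping.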
+def rgb_to_id(color): + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def id_to_rgb(id_map): + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class ConditionalDETRFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a CONDITIONAL_DETR feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + + Args: + format (`str`, *optional*, defaults to `"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input to a certain `size`. + size (`int`, *optional*, defaults to 800): + Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a + sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of + the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * + height / width, size)`. + max_size (`int`, *optional*, defaults to `1333`): + The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is + set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`): + The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean. + image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the + ImageNet std. 
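+
+    Examples:
+
+    A minimal usage sketch: with the defaults above, a single PIL image is resized, normalized and padded, and the
+    returned encoding contains `pixel_values` together with the corresponding `pixel_mask`:
+
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+
+    >>> from transformers import ConditionalDETRFeatureExtractor
+
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+
+    >>> feature_extractor = ConditionalDETRFeatureExtractor()
+    >>> encoding = feature_extractor(images=image, return_tensors="pt")
+    >>> list(encoding.keys())
+    ['pixel_values', 'pixel_mask']
+    ```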
+ """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + format="coco_detection", + do_resize=True, + size=800, + max_size=1333, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.format = self._is_valid_format(format) + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean + self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std + + def _is_valid_format(self, format): + if format not in ["coco_detection", "coco_panoptic"]: + raise ValueError(f"Format {format} not supported") + return format + + def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): + if self.format == "coco_detection": + image, target = self.prepare_coco_detection(image, target, return_segmentation_masks) + return image, target + elif self.format == "coco_panoptic": + image, target = self.prepare_coco_panoptic(image, target, masks_path) + return image, target + else: + raise ValueError(f"Format {self.format} not supported") + + # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 + def convert_coco_poly_to_mask(self, segmentations, height, width): + + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 + def prepare_coco_detection(self, image, target, return_segmentation_masks=False): + """ + Convert the target in COCO format into the format expected by CONDITIONAL_DETR. 
+ """ + w, h = image.size + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # get all COCO annotations for the given image + anno = target["annotations"] + + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=w) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = np.asarray(classes, dtype=np.int64) + + if return_segmentation_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = self.convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.reshape((-1, 3)) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if return_segmentation_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["class_labels"] = classes + if return_segmentation_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in anno], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno], dtype=np.int64) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + + return image, target + + def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): + w, h = image.size + ann_info = target.copy() + ann_path = pathlib.Path(masks_path) / ann_info["file_name"] + + if "segments_info" in ann_info: + masks = np.asarray(Image.open(ann_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([ann["id"] for ann in ann_info["segments_info"]]) + masks = masks == ids[:, None, None] + masks = np.asarray(masks, dtype=np.uint8) + + labels = np.asarray([ann["category_id"] for ann in ann_info["segments_info"]], dtype=np.int64) + + target = {} + target["image_id"] = np.asarray( + [ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]], dtype=np.int64 + ) + if return_masks: + target["masks"] = masks + target["class_labels"] = labels + + target["boxes"] = masks_to_boxes(masks) + + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + if "segments_info" in ann_info: + target["iscrowd"] = np.asarray([ann["iscrowd"] for ann in ann_info["segments_info"]], dtype=np.int64) + target["area"] = np.asarray([ann["area"] for ann in ann_info["segments_info"]], dtype=np.float32) + + return image, target + + def _resize(self, image, size, target=None, max_size=None): + """ + Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller + edge of the image will be matched to this number. + + If given, also resize the target accordingly. 
+ """ + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size + else: + # size returned must be (w, h) since we use PIL to resize images + # so we revert the tuple + return get_size_with_aspect_ratio(image_size, size, max_size)[::-1] + + size = get_size(image.size, size, max_size) + rescaled_image = self.resize(image, size=size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + w, h = size + target["size"] = np.asarray([h, w], dtype=np.int64) + + if "masks" in target: + # use PyTorch as current workaround + # TODO replace by self.resize + masks = torch.from_numpy(target["masks"][:, None]).float() + interpolated_masks = nn.functional.interpolate(masks, size=(h, w), mode="nearest")[:, 0] > 0.5 + target["masks"] = interpolated_masks.numpy() + + return rescaled_image, target + + def _normalize(self, image, mean, std, target=None): + """ + Normalize the image with a certain mean and std. + + If given, also normalize the target bounding boxes based on the size of the image. + """ + + image = self.normalize(image, mean=mean, std=std) + if target is None: + return image, None + + target = target.copy() + h, w = image.shape[-2:] + + if "boxes" in target: + boxes = target["boxes"] + boxes = corners_to_center_format(boxes) + boxes = boxes / np.asarray([w, h, w, h], dtype=np.float32) + target["boxes"] = boxes + + return image, target + + def __call__( + self, + images: ImageInput, + annotations: Union[List[Dict], List[List[Dict]]] = None, + return_segmentation_masks: Optional[bool] = False, + masks_path: Optional[pathlib.Path] = None, + pad_and_return_pixel_mask: Optional[bool] = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s) and optional annotations. Images are by default + padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are + real/which are padding. + + + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. 
In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + annotations (`Dict`, `List[Dict]`, *optional*): + The corresponding annotations in COCO format. + + In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_detection"`, the + annotations for each image should have the following format: {'image_id': int, 'annotations': + [annotation]}, with the annotations being a list of COCO object annotations. + + In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the + annotations for each image should have the following format: {'image_id': int, 'file_name': str, + 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic annotations. + + return_segmentation_masks (`Dict`, `List[Dict]`, *optional*, defaults to `False`): + Whether to also include instance segmentation masks as part of the labels in case `format = + "coco_detection"`. + + masks_path (`pathlib.Path`, *optional*): + Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only + relevant in case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`. + + pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether or not to pad images up to the largest image in a batch and create a pixel mask. + + If left to the default, will return a pixel mask that is: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor` + objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if + *"pixel_mask"* is in `self.model_input_names`). + - **labels** -- Optional labels to be fed to a model (when `annotations` are provided) + """ + # Input type checking for clearer error + + valid_images = False + valid_annotations = False + valid_masks_path = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." 
+ ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + # Check that annotations has a valid type + if annotations is not None: + if not is_batched: + if self.format == "coco_detection": + if isinstance(annotations, dict) and "image_id" in annotations and "annotations" in annotations: + if isinstance(annotations["annotations"], (list, tuple)): + # an image can have no annotations + if len(annotations["annotations"]) == 0 or isinstance(annotations["annotations"][0], dict): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations, dict) and "image_id" in annotations and "segments_info" in annotations: + if isinstance(annotations["segments_info"], (list, tuple)): + # an image can have no segments (?) + if len(annotations["segments_info"]) == 0 or isinstance( + annotations["segments_info"][0], dict + ): + valid_annotations = True + else: + if isinstance(annotations, (list, tuple)): + if len(images) != len(annotations): + raise ValueError("There must be as many annotations as there are images") + if isinstance(annotations[0], Dict): + if self.format == "coco_detection": + if isinstance(annotations[0]["annotations"], (list, tuple)): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations[0]["segments_info"], (list, tuple)): + valid_annotations = True + + if not valid_annotations: + raise ValueError( + """ + Annotations must of type `Dict` (single image) or `List[Dict]` (batch of images). In case of object + detection, each dictionary should contain the keys 'image_id' and 'annotations', with the latter + being a list of annotations in COCO format. In case of panoptic segmentation, each dictionary + should contain the keys 'file_name', 'image_id' and 'segments_info', with the latter being a list + of annotations in COCO format. + """ + ) + + # Check that masks_path has a valid type + if masks_path is not None: + if self.format == "coco_panoptic": + if isinstance(masks_path, pathlib.Path): + valid_masks_path = True + if not valid_masks_path: + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + " `pathlib.Path` object." 
+ ) + + if not is_batched: + images = [images] + if annotations is not None: + annotations = [annotations] + + # prepare (COCO annotations as a list of Dict -> CONDITIONAL_DETR target as a single Dict per image) + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + image, target = self.prepare(image, target, return_segmentation_masks, masks_path) + images[idx] = image + annotations[idx] = target + + # transformations (resizing + normalization) + if self.do_resize and self.size is not None: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._resize(image=image, target=target, size=self.size, max_size=self.max_size) + images[idx] = image + annotations[idx] = target + else: + for idx, image in enumerate(images): + images[idx] = self._resize(image=image, target=None, size=self.size, max_size=self.max_size)[0] + + if self.do_normalize: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._normalize( + image=image, mean=self.image_mean, std=self.image_std, target=target + ) + images[idx] = image + annotations[idx] = target + else: + images = [ + self._normalize(image=image, mean=self.image_mean, std=self.image_std)[0] for image in images + ] + + if pad_and_return_pixel_mask: + # pad images up to largest image in batch and create pixel_mask + max_size = self._max_by_axis([list(image.shape) for image in images]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in images: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + images = padded_images + + # return as BatchFeature + data = {} + data["pixel_values"] = images + if pad_and_return_pixel_mask: + data["pixel_mask"] = pixel_mask + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + # Convert to TensorType + tensor_type = return_tensors + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + if not tensor_type == TensorType.PYTORCH: + raise ValueError("Only PyTorch is supported for the moment.") + else: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + + encoded_inputs["labels"] = [ + {k: torch.from_numpy(v) for k, v in target.items()} for target in annotations + ] + + return encoded_inputs + + def _max_by_axis(self, the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + def pad_and_create_pixel_mask( + self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None + ): + """ + Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. + + Args: + pixel_values_list (`List[torch.Tensor]`): + List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. 
If set to `'pt'`, return PyTorch `torch.Tensor` + objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if + *"pixel_mask"* is in `self.model_input_names`). + + """ + + max_size = self._max_by_axis([list(image.shape) for image in pixel_values_list]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in pixel_values_list: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + + # return as BatchFeature + data = {"pixel_values": padded_images, "pixel_mask": pixel_mask} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs + + # POSTPROCESSING METHODS + # inspired by https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py#L258 + def post_process(self, outputs, target_sizes): + """ + Converts the output of [`ConditionalDETRForObjectDetection`] into the format expected by the COCO api. Only + supports PyTorch. + + Args: + outputs ([`ConditionalDETRObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). For visualization, this should be the image size after data + augment, but before padding. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = topk_indexes // out_logits.shape[2] + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into image segmentation predictions. Only supports + PyTorch. + + Parameters: + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. 
+ threshold (`float`, *optional*, defaults to 0.9): + Threshold to use to filter out queries. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image + in the batch as predicted by the model. + """ + out_logits, raw_masks = outputs.logits, outputs.pred_masks + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1 + + predictions = {"scores": cur_scores, "labels": cur_classes, "masks": cur_masks} + preds.append(predictions) + return preds + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into actual instance segmentation predictions. Only + supports PyTorch. + + Args: + results (`List[Dict]`): + Results list obtained by [`~ConditionalDETRFeatureExtractor.post_process`], to which "masks" results + will be added. + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). + max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). + threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an + image in the batch as predicted by the model. 
+ """ + + if len(orig_target_sizes) != len(max_target_sizes): + raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes") + max_h, max_w = max_target_sizes.max(0)[0].tolist() + outputs_masks = outputs.pred_masks.squeeze(2) + outputs_masks = nn.functional.interpolate( + outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False + ) + outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() + + for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + img_h, img_w = t[0], t[1] + results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) + results[i]["masks"] = nn.functional.interpolate( + results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" + ).byte() + + return results + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 + def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into actual panoptic predictions. Only supports + PyTorch. + + Parameters: + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data + augmentation but before batching. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to + None, it will default to the `processed_sizes`. + is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + Dictionary mapping class indices to either True or False, depending on whether or not they are a thing. + If not set, defaults to the `is_thing_map` of COCO panoptic. + threshold (`float`, *optional*, defaults to 0.85): + Threshold to use to filter out queries. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for + an image in the batch as predicted by the model. 
+ """ + if target_sizes is None: + target_sizes = processed_sizes + if len(processed_sizes) != len(target_sizes): + raise ValueError("Make sure to pass in as many processed_sizes as target_sizes") + + if is_thing_map is None: + # default to is_thing_map of COCO panoptic + is_thing_map = {i: i <= 90 for i in range(201)} + + out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes + if not len(out_logits) == len(raw_masks) == len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" + ) + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = center_to_corners_format(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + if len(cur_boxes) != len(cur_classes): + raise ValueError("Not as many boxes as there are classes") + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_classes): + if not is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) + + np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) + np_seg_img = np_seg_img.view(final_h, final_w, 3) + np_seg_img = np_seg_img.numpy() + + m_id = torch.from_numpy(rgb_to_id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_classes.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + 
cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_classes[i].item() + segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) + del cur_classes + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py new file mode 100644 index 0000000000000..1f63d22be27a5 --- /dev/null +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -0,0 +1,2547 @@ +# coding=utf-8 +# Copyright 2022 Attn4Vis and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Conditional DETR model.""" + + +import math +import random +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +from torch import Tensor, nn +from torch.nn import functional as F + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import torch_int_div +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + is_vision_available, + logging, + replace_return_docstrings, + requires_backends, +) +from .configuration_conditional_detr import ConditionalDETRConfig + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_vision_available(): + from .feature_extraction_conditional_detr import center_to_corners_format + +if is_timm_available(): + from timm import create_model + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ConditionalDETRConfig" +_CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" + +CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Atten4Vis/ConditionalDETR", + # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr +] + + + +@dataclass +class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): + """ + Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + reference_points: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class ConditionalDETRModelOutput(Seq2SeqModelOutput): + """ + Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + reference_points: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDETR +class ConditionalDETRObjectDetectionOutput(ModelOutput): + """ + Output type of [`ConditionalDETRForObjectDetection`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding + boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. 
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDETR +class ConditionalDETRSegmentationOutput(ModelOutput): + """ + Output type of [`ConditionalDETRForSegmentation`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. 
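Since both output classes describe the box loss as a combination of the L1 loss and a generalized IoU loss, a self-contained sketch of the generalized IoU term may help. This is just the standard formula for corner-format boxes, not the `ConditionalDETRLoss` code used further down.

```python
import torch

def generalized_iou(boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
    """GIoU for one-to-one paired boxes in (x0, y0, x1, y1) format."""
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    # intersection and union
    lt = torch.max(boxes1[:, :2], boxes2[:, :2])
    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, 0] * wh[:, 1]
    union = area1 + area2 - inter
    iou = inter / union
    # smallest enclosing box
    lt_enc = torch.min(boxes1[:, :2], boxes2[:, :2])
    rb_enc = torch.max(boxes1[:, 2:], boxes2[:, 2:])
    wh_enc = (rb_enc - lt_enc).clamp(min=0)
    area_enc = wh_enc[:, 0] * wh_enc[:, 1]
    return iou - (area_enc - union) / area_enc

print(generalized_iou(torch.tensor([[0.0, 0.0, 2.0, 2.0]]), torch.tensor([[1.0, 1.0, 3.0, 3.0]])))
# tensor([-0.0794])
```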
+ loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding + boxes. + pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): + Segmentation masks logits for all queries. See also [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks + respectively. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. 
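As a rough illustration of the `pred_masks` field described above (per-query mask logits at 1/4 of the padded input resolution), turning logits into binary masks comes down to a sigmoid, an upsample and a threshold. The snippet below is a hedged sketch on dummy tensors, not the [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] implementation.

```python
import torch
import torch.nn.functional as F

batch_size, num_queries, height, width = 1, 300, 800, 1200
# dummy mask logits at 1/4 of the padded input resolution, as documented above
pred_masks = torch.randn(batch_size, num_queries, height // 4, width // 4)

probs = pred_masks.sigmoid()
# upsample back to the padded input size and binarize
probs = F.interpolate(probs, size=(height, width), mode="bilinear", align_corners=False)
binary_masks = probs > 0.5
print(binary_masks.shape)  # torch.Size([1, 300, 800, 1200])
```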
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + pred_masks: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +# BELOW: utilities copied from +# https://github.com/facebookresearch/detr/blob/master/backbone.py +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDETR +class ConditionalDETRFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super(ConditionalDETRFrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(ConditionalDETRFrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDETR +def replace_batch_norm(m, name=""): + for attr_str in dir(m): + target_attr = getattr(m, attr_str) + if isinstance(target_attr, nn.BatchNorm2d): + frozen = ConditionalDETRFrozenBatchNorm2d(target_attr.num_features) + bn = getattr(m, attr_str) + frozen.weight.data.copy_(bn.weight) + frozen.bias.data.copy_(bn.bias) + frozen.running_mean.data.copy_(bn.running_mean) + frozen.running_var.data.copy_(bn.running_var) + setattr(m, attr_str, frozen) + for n, ch in m.named_children(): + replace_batch_norm(ch, n) + + +# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder with Detr->ConditionalDETR +class ConditionalDETRTimmConvEncoder(nn.Module): + """ + Convolutional encoder 
(backbone) from the timm library. + + nn.BatchNorm2d layers are replaced by ConditionalDETRFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, name: str, dilation: bool): + super().__init__() + + kwargs = {} + if dilation: + kwargs["output_stride"] = 16 + + requires_backends(self, ["timm"]) + + backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs) + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = self.model.feature_info.channels() + + if "resnet" in name: + for name, parameter in self.model.named_parameters(): + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDETR +class ConditionalDETRConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + +# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDETR +class ConditionalDETRSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
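To make this description concrete, here is a simplified standalone sketch of the 2D sine embedding (device handling and the optional `normalize`/`scale` switches of the class below are folded into fixed choices; the function name is illustrative):

```python
import math
import torch

def sine_position_embedding_2d(pixel_mask: torch.Tensor, embedding_dim: int = 64, temperature: float = 10000.0):
    """pixel_mask: (batch, height, width), 1 for real pixels. Returns (batch, 2*embedding_dim, height, width)."""
    y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
    x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
    scale = 2 * math.pi
    y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * scale
    x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * scale

    dim_t = torch.arange(embedding_dim, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / embedding_dim)

    pos_x = x_embed[:, :, :, None] / dim_t
    pos_y = y_embed[:, :, :, None] / dim_t
    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=4).flatten(3)
    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=4).flatten(3)
    return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)

mask = torch.ones(2, 25, 38, dtype=torch.long)  # a 25x38 feature map with no padding
print(sine_position_embedding_2d(mask).shape)  # torch.Size([2, 128, 25, 38])
```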
+ """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2) / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDETR +class ConditionalDETRLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + h, w = pixel_values.shape[-2:] + i = torch.arange(w, device=pixel_values.device) + j = torch.arange(h, device=pixel_values.device) + x_emb = self.column_embeddings(i) + y_emb = self.row_embeddings(j) + pos = torch.cat([x_emb.unsqueeze(0).repeat(h, 1, 1), y_emb.unsqueeze(1).repeat(1, w, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDETR +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = ConditionalDETRSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = ConditionalDETRLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + +# function to generate sine positional embedding for 2d coordinates +def gen_sineembed_for_position(pos_tensor): + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * (dim_t // 2) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + pos = torch.cat((pos_y, pos_x), dim=2) + return pos + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - 
x).clamp(min=eps) + return torch.log(x1/x2) + +class DetrAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the CONDITIONAL_DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + key_value_states: Optional[torch.Tensor] = None, + key_value_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # add key-value position embeddings to the key value states + if key_value_position_embeddings is not None: + key_value_states_original = key_value_states + key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states_original), -1, bsz) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states_original), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, 
tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + +class ConditionalDETRAttention(nn.Module): + """ + Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. + + The key q_proj, k_proj, v_proj are defined outside the attention. + This attention allows the dim of q, k to be different to v. + """ + + def __init__( + self, + embed_dim: int, + out_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.out_dim = out_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + ) + # head dimension of values + self.v_head_dim = out_dim // num_heads + if self.v_head_dim * num_heads != self.out_dim: + raise ValueError( + f"out_dim must be divisible by num_heads (got `out_dim`: {self.out_dim} and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + + self.out_proj = nn.Linear(out_dim, out_dim, bias=bias) + + def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + key_states: Optional[torch.Tensor] = None, + value_states: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = hidden_states * self.scaling + # get key, value proj + key_states = self._qk_shape(key_states, -1, bsz) + value_states = self._v_shape(value_states, -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + v_proj_shape = (bsz * self.num_heads, -1, self.v_head_dim) + query_states = self._qk_shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*v_proj_shape) + + src_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.v_head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.v_head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.out_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class ConditionalDETREncoderLayer(nn.Module): + def __init__(self, config: ConditionalDETRConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_embeddings (`torch.FloatTensor`, *optional*): position embeddings, to be added to hidden_states. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class ConditionalDETRDecoderLayer(nn.Module): + def __init__(self, config: ConditionalDETRConfig): + super().__init__() + self.embed_dim = config.d_model + + d_model = config.d_model + # Decoder Self-Attention projections + self.sa_qcontent_proj = nn.Linear(d_model, d_model) + self.sa_qpos_proj = nn.Linear(d_model, d_model) + self.sa_kcontent_proj = nn.Linear(d_model, d_model) + self.sa_kpos_proj = nn.Linear(d_model, d_model) + self.sa_v_proj = nn.Linear(d_model, d_model) + + self.self_attn = ConditionalDETRAttention( + embed_dim=self.embed_dim, + out_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + + # Decoder Cross-Attention projections + self.ca_qcontent_proj = nn.Linear(d_model, d_model) + self.ca_qpos_proj = nn.Linear(d_model, d_model) + self.ca_kcontent_proj = nn.Linear(d_model, d_model) + self.ca_kpos_proj = nn.Linear(d_model, d_model) + self.ca_v_proj = nn.Linear(d_model, d_model) + self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) + + self.encoder_attn = ConditionalDETRAttention( + self.embed_dim*2, + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + self.nhead = config.decoder_attention_heads + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + query_sine_embed: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + is_first: Optional[bool] = False + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, 
src_len)` where padding elements are indicated by very large negative values. + position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the cross-attention layer. + query_position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the self-attention layer. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # ========== Begin of Self-Attention ============= + # Apply projections here + # shape: num_queries x batch_size x 256 + q_content = self.sa_qcontent_proj(hidden_states) # target is the input of the first decoder layer. zero by default. + q_pos = self.sa_qpos_proj(query_position_embeddings) + k_content = self.sa_kcontent_proj(hidden_states) + k_pos = self.sa_kpos_proj(query_position_embeddings) + v = self.sa_v_proj(hidden_states) + + bs, num_queries, n_model = q_content.shape + _, hw, _ = k_content.shape + + q = q_content + q_pos + k = k_content + k_pos + hidden_states, self_attn_weights = self.self_attn( + hidden_states=q, + attention_mask=attention_mask, + key_states=k, + value_states=v, + output_attentions=output_attentions, + ) + # ============ End of Self-Attention ============= + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # ========== Begin of Cross-Attention ============= + # Apply projections here + # shape: num_queries x batch_size x 256 + q_content = self.ca_qcontent_proj(hidden_states) + k_content = self.ca_kcontent_proj(encoder_hidden_states) + v = self.ca_v_proj(encoder_hidden_states) + + bs, num_queries, n_model = q_content.shape + _, hw, _ = k_content.shape + + k_pos = self.ca_kpos_proj(position_embeddings) + + # For the first decoder layer, we concatenate the positional embedding predicted from + # the object query (the positional embedding) into the original query (key) in DETR. 
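+        # Shape walk-through for the concatenation below (assuming the default d_model = 256 and 8 heads):
+        #   q_content: (batch_size, num_queries, 256), k_content: (batch_size, height*width, 256)
+        #   after .view(...): (batch_size, seq_len, 8, 32); concatenating the sine position embedding
+        #   per head and flattening back gives (batch_size, seq_len, 512).
+        # The cross-attention therefore compares 512-dim queries/keys (content + spatial) while the
+        # values stay 256-dim, which is why ConditionalDETRAttention above allows out_dim != embed_dim.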
+ if is_first: + q_pos = self.ca_qpos_proj(query_position_embeddings) + q = q_content + q_pos + k = k_content + k_pos + else: + q = q_content + k = k_content + + q = q.view(bs, num_queries, self.nhead, n_model//self.nhead) + query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) + query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model//self.nhead) + q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) + k = k.view(bs, hw, self.nhead, n_model//self.nhead) + k_pos = k_pos.view(bs, hw, self.nhead, n_model//self.nhead) + k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=q, + attention_mask=encoder_attention_mask, + key_states=k, + value_states=v, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # ============ End of Cross-Attention ============= + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDETR +class ConditionalDETRClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDETR +class ConditionalDETRPreTrainedModel(PreTrainedModel): + config_class = ConditionalDETRConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + xavier_std = self.config.init_xavier_std + + if isinstance(module, ConditionalDETRMHAttentionMap): + nn.init.zeros_(module.k_linear.bias) + 
nn.init.zeros_(module.q_linear.bias) + nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) + nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) + elif isinstance(module, ConditionalDETRLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, ConditionalDETRDecoder): + module.gradient_checkpointing = value + + +CONDITIONAL_DETR_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`ConditionalDETRConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CONDITIONAL_DETR_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See [`ConditionalDETRFeatureExtractor.__call__`] for + details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. 
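The `pixel_mask` semantics documented above (1 for real pixels, 0 for padding) correspond to what [`ConditionalDETRFeatureExtractor`] produces when it pads a batch of differently sized images. A rough manual sketch of that padding step, with made-up image sizes:

```python
import torch

# two images of different sizes (3 channels each), purely for illustration
images = [torch.rand(3, 480, 640), torch.rand(3, 600, 500)]

max_h = max(img.shape[1] for img in images)
max_w = max(img.shape[2] for img in images)

pixel_values = torch.zeros(len(images), 3, max_h, max_w)
pixel_mask = torch.zeros(len(images), max_h, max_w, dtype=torch.long)
for i, img in enumerate(images):
    _, h, w = img.shape
    pixel_values[i, :, :h, :w] = img  # copy the image into the top-left corner
    pixel_mask[i, :h, :w] = 1         # 1 for real pixels, 0 for padding

print(pixel_values.shape, pixel_mask.shape)
# torch.Size([2, 3, 600, 640]) torch.Size([2, 600, 640])
```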
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR +class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`ConditionalDETREncoderLayer`]. + + The encoder updates the flattened feature map through multiple self-attention layers. + + Small tweak for CONDITIONAL_DETR: + + - position_embeddings are added to the forward pass. + + Args: + config: ConditionalDETRConfig + """ + + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + self.layers = nn.ModuleList([ConditionalDETREncoderLayer(config) for _ in range(config.encoder_layers)]) + + # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
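In the encoder body that follows, this 2D padding mask is expanded into the additive attention bias consumed by the attention layers. A small numeric sketch mirroring `_expand_mask` defined earlier in this file (with `tgt_len` left at its default of `src_len`):

```python
import torch

def expand_mask(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    """[bsz, src_len] padding mask -> additive bias of shape [bsz, 1, src_len, src_len]."""
    bsz, src_len = mask.size()
    expanded = mask[:, None, None, :].expand(bsz, 1, src_len, src_len).to(dtype)
    inverted = 1.0 - expanded
    return inverted.masked_fill(inverted.bool(), torch.finfo(dtype).min)

mask = torch.tensor([[1, 1, 0]])  # last position is padding
print(expand_mask(mask, torch.float32)[0, 0])
# each row reads roughly [0., 0., -3.4028e+38]
```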
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + # we add position_embeddings as extra input to the encoder_layer + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDETRDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some small tweaks for CONDITIONAL_DETR: + + - position_embeddings and query_position_embeddings are added to the forward pass. + - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. 
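A sketch of how that stack of intermediate activations is typically consumed for auxiliary losses; the shared head here is a stand-in for the `class_labels_classifier` defined later in `ConditionalDETRForObjectDetection`, and the sizes assume the default 6 decoder layers, 300 queries and d_model = 256, with 91 classes purely as an example:

```python
import torch
from torch import nn

decoder_layers, batch_size, num_queries, d_model, num_labels = 6, 2, 300, 256, 91
intermediate = torch.randn(decoder_layers, batch_size, num_queries, d_model)

# a shared classification head applied to every decoder layer's output,
# so that each layer gets its own (auxiliary) set of predictions
class_head = nn.Linear(d_model, num_labels)
per_layer_logits = class_head(intermediate)
print(per_layer_logits.shape)  # torch.Size([6, 2, 300, 91])
```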
+ + Args: + config: ConditionalDETRConfig + """ + + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + + self.layers = nn.ModuleList([ConditionalDETRDecoderLayer(config) for _ in range(config.decoder_layers)]) + # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output + self.layernorm = nn.LayerNorm(config.d_model) + d_model = config.d_model + self.gradient_checkpointing = False + + # query_scale is the FFN applied on f to generate transformation T + self.query_scale = MLP(d_model, d_model, d_model, 2) + self.ref_point_head = MLP(d_model, d_model, 2, 2) + for layer_id in range(config.decoder_layers - 1): + self.layers[layer_id + 1].ca_qpos_proj = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + query_position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The query embeddings that are passed into the decoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`: + + - 1 for queries that are **not masked**, + - 0 for queries that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each cross-attention layer. + query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + , *optional*): Position embeddings that are added to the queries and keys in each self-attention layer. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
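The decoder body that follows maps each learned query embedding to a 2D reference point (via `ref_point_head` and a sigmoid) and then to a sine embedding with `gen_sineembed_for_position` defined earlier. A simplified, self-contained sketch of that second step (function name illustrative):

```python
import math
import torch

def sineembed_for_points(points: torch.Tensor, d: int = 128, temperature: float = 10000.0) -> torch.Tensor:
    """points: (num_queries, batch_size, 2) in [0, 1]; returns (num_queries, batch_size, 2*d)."""
    scale = 2 * math.pi
    dim_t = torch.arange(d, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / d)
    x = points[..., 0] * scale
    y = points[..., 1] * scale
    pos_x = x[..., None] / dim_t
    pos_y = y[..., None] / dim_t
    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2)
    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2)
    return torch.cat((pos_y, pos_x), dim=-1)

reference_points = torch.rand(300, 2, 2)  # 300 queries, batch of 2, (x, y) after the sigmoid
print(sineembed_for_points(reference_points).shape)  # torch.Size([300, 2, 256])
```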
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + input_shape = inputs_embeds.size()[:-1] + + combined_attention_mask = None + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # optional intermediate hidden states + intermediate = () if self.config.auxiliary_loss else None + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + reference_points_before_sigmoid = self.ref_point_head(query_position_embeddings) # [num_queries, batch_size, 2] + reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) + obj_center = reference_points[..., :2].transpose(0, 1) + # get sine embedding for the query vector + query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + if idx == 0: + pos_transformation = 1 + else: + pos_transformation = self.query_scale(hidden_states) + # apply transformation + query_sine_embed = query_sine_embed_before_transformation * pos_transformation + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + position_embeddings, + query_position_embeddings, + query_sine_embed, + encoder_hidden_states, + encoder_attention_mask, + None, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + query_sine_embed=query_sine_embed, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + is_first=(idx==0) + ) + + hidden_states = layer_outputs[0] + + if self.config.auxiliary_loss: + hidden_states = self.layernorm(hidden_states) + intermediate += (hidden_states,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # finally, apply layernorm + hidden_states = self.layernorm(hidden_states) + + # add hidden states from the last 
decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # stack intermediate decoder activations + if self.config.auxiliary_loss: + intermediate = torch.stack(intermediate) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate, reference_points] + if v is not None + ) + return ConditionalDETRDecoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + intermediate_hidden_states=intermediate, + reference_points=reference_points + ) + + +@add_start_docstrings( + """ + The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without + any specific head on top. + """, + CONDITIONAL_DETR_START_DOCSTRING, +) +class ConditionalDETRModel(ConditionalDETRPreTrainedModel): + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = ConditionalDETRTimmConvEncoder(config.backbone, config.dilation) + position_embeddings = build_position_encoding(config) + self.backbone = ConditionalDETRConvModel(backbone, position_embeddings) + + # Create projection layer + self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) + + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = ConditionalDETREncoder(config) + self.decoder = ConditionalDETRDecoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ConditionalDETRModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if 
pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), device=device) + + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # pixel_values should be of shape (batch_size, num_channels, height, width) + # pixel_mask should be of shape (batch_size, height, width) + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # get final feature map and downsampled mask + feature_map, mask = features[-1] + + if mask is None: + raise ValueError("Backbone does not return downsampled pixel mask") + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + projected_feature_map = self.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + inputs_embeds=queries, + attention_mask=None, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return ConditionalDETRModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + reference_points=decoder_outputs.reference_points + ) + + +@add_start_docstrings( + """ + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on 
top, for tasks + such as COCO detection. + """, + CONDITIONAL_DETR_START_DOCSTRING, +) +# Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR +class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel): + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + # CONDITIONAL_DETR encoder-decoder model + self.model = ConditionalDETRModel(config) + + # Object detection heads + self.class_labels_classifier = nn.Linear( + config.d_model, config.num_labels + ) # We add one for the "no object" class + self.bbox_predictor = ConditionalDETRMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + # Initialize weights and apply final processing + self.post_init() + + # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ConditionalDETRObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. 
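+            For illustration only (tensor values are arbitrary), `labels` for a batch containing a single image
+            with two annotated boxes could be built as follows. Boxes use the normalized
+            `(center_x, center_y, width, height)` format expected by the loss; in practice these dictionaries are
+            typically produced by `ConditionalDETRFeatureExtractor` when COCO annotations are passed.
+
+            ```python
+            >>> import torch
+
+            >>> labels = [
+            ...     {
+            ...         "class_labels": torch.tensor([1, 17], dtype=torch.long),
+            ...         "boxes": torch.tensor([[0.50, 0.50, 0.20, 0.30], [0.25, 0.40, 0.10, 0.10]]),
+            ...     }
+            ... ]
+            ```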
+ + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # model predicts bounding boxes and corresponding COCO classes + >>> logits = outputs.logits + >>> bboxes = outputs.pred_boxes + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through CONDITIONAL_DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # class logits + predicted bounding boxes + logits = self.class_labels_classifier(sequence_output) + + reference = outputs.reference_points if return_dict else outputs[-1] + reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) + outputs_coords = [] + hs = sequence_output + tmp = self.bbox_predictor(hs) + tmp[..., :2] += reference_before_sigmoid + pred_boxes = tmp.sigmoid() + # pred_boxes = self.bbox_predictor(sequence_output).sigmoid() + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = ConditionalDETRHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = ConditionalDETRLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] + outputs_class = self.class_labels_classifier(intermediate) + + for lvl in range(hs.shape[0]): + tmp = self.bbox_predictor(hs[lvl]) + tmp[..., :2] += reference_before_sigmoid + outputs_coord = tmp.sigmoid() + outputs_coords.append(outputs_coord) + outputs_coord = torch.stack(outputs_coords) + + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": self.config.cls_loss_coefficient, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] 
for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return ConditionalDETRObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks + such as COCO panoptic. + + """, + CONDITIONAL_DETR_START_DOCSTRING, +) +# Copied from transformers.models.detr.modeling_detr.DetrForSegmentation with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR +class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + # object detection model + self.conditional_detr = ConditionalDETRForObjectDetection(config) + + # segmentation head + hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads + intermediate_channel_sizes = self.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes + + self.mask_head = ConditionalDETRMaskHeadSmallConv( + hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size + ) + + self.bbox_attention = ConditionalDETRMHAttentionMap( + hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ConditionalDETRSegmentationOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each + dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels, + bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves + should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`. 
+ + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + >>> model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # model predicts COCO classes, bounding boxes, and masks + >>> logits = outputs.logits + >>> bboxes = outputs.pred_boxes + >>> masks = outputs.pred_masks + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones((batch_size, height, width), device=device) + + # First, get list of feature maps and position embeddings + features, position_embeddings_list = self.conditional_detr.model.backbone(pixel_values, pixel_mask=pixel_mask) + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + feature_map, mask = features[-1] + batch_size, num_channels, height, width = feature_map.shape + projected_feature_map = self.conditional_detr.model.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.conditional_detr.model.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.conditional_detr.model.query_position_embeddings.weight.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.conditional_detr.model.decoder( + inputs_embeds=queries, + attention_mask=None, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + 
encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Sixth, compute logits, pred_boxes and pred_masks + logits = self.conditional_detr.class_labels_classifier(sequence_output) + pred_boxes = self.conditional_detr.bbox_predictor(sequence_output).sigmoid() + + memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width) + mask = flattened_mask.view(batch_size, height, width) + + # FIXME h_boxes takes the last one computed, keep this in mind + # important: we need to reverse the mask, since in the original implementation the mask works reversed + # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32) + bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask) + + seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) + + pred_masks = seg_masks.view(batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = ConditionalDETRHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality", "masks"] + criterion = ConditionalDETRLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + outputs_loss["pred_masks"] = pred_masks + if self.config.auxiliary_loss: + intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1] + outputs_class = self.class_labels_classifier(intermediate) + outputs_coord = self.bbox_predictor(intermediate).sigmoid() + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + weight_dict["loss_mask"] = self.config.mask_loss_coefficient + weight_dict["loss_dice"] = self.config.dice_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs + else: + output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return ConditionalDETRSegmentationOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + pred_masks=pred_masks, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=decoder_outputs.last_hidden_state, + 
decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def _expand(tensor, length: int): + return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py +# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDETR +class ConditionalDETRMaskHeadSmallConv(nn.Module): + """ + Simple convolutional head, using group norm. Upsampling is done using a FPN approach + """ + + def __init__(self, dim, fpn_dims, context_dim): + super().__init__() + + if dim % 8 != 0: + raise ValueError( + "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8" + ) + + inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + + self.lay1 = nn.Conv2d(dim, dim, 3, padding=1) + self.gn1 = nn.GroupNorm(8, dim) + self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1) + self.gn2 = nn.GroupNorm(8, inter_dims[1]) + self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = nn.GroupNorm(8, inter_dims[2]) + self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = nn.GroupNorm(8, inter_dims[3]) + self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = nn.GroupNorm(8, inter_dims[4]) + self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1) + + self.dim = dim + + self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): + # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with + # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32). + # We expand the projected feature map to match the number of heads. 
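+        # After the expansion and concatenation below, x has shape
+        # (batch_size * num_queries, d_model + num_heads, height/32, width/32), which matches the `dim`
+        # (hidden_size + number of attention heads) that this head's convolutional layers were built with.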
+ x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) + + x = self.lay1(x) + x = self.gn1(x) + x = nn.functional.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter1(fpns[0]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay3(x) + x = self.gn3(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter2(fpns[1]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay4(x) + x = self.gn4(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter3(fpns[2]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay5(x) + x = self.gn5(x) + x = nn.functional.relu(x) + + x = self.out_lay(x) + return x + + +# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDETR +class ConditionalDETRMHAttentionMap(nn.Module): + """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + + self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 + + def forward(self, q, k, mask: Optional[Tensor] = None): + q = self.q_linear(q) + k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) + keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) + + if mask is not None: + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = self.dropout(weights) + return weights + + +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. 
Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py +class ConditionalDETRLoss(nn.Module): + """ + This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process happens in two steps: 1) + we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair + of matched ground-truth / prediction (supervise class and box). + + + + Args: + matcher (`ConditionalDETRHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parmeter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor of dim + [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + src_logits = outputs["logits"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2]+1], + dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:,:,:-1] + loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1] + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. 
+ """ + logits = outputs["logits"] + device = logits.device + tgt_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(src_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the masks: the focal loss and the dice loss. + + Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. + """ + if "pred_masks" not in outputs: + raise KeyError("No predicted masks found in outputs") + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = nn.functional.interpolate( + src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False + ) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + "masks": self.loss_masks, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + 
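+    # Illustrative usage (a sketch, not executed here): ConditionalDETRForObjectDetection builds the matcher and
+    # this criterion roughly as follows and calls it with the predictions plus the list of target dicts; the
+    # numeric costs/coefficients below are placeholders, the real values come from the model config:
+    #
+    #     matcher = ConditionalDETRHungarianMatcher(class_cost=2, bbox_cost=5, giou_cost=2)
+    #     criterion = ConditionalDETRLoss(
+    #         matcher=matcher, num_classes=91, focal_alpha=0.25, losses=["labels", "boxes", "cardinality"]
+    #     )
+    #     loss_dict = criterion({"logits": logits, "pred_boxes": pred_boxes}, labels)
+    #     loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)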
def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDETR,detr->conditional_detr +class ConditionalDETRMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py +class ConditionalDETRHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. 
+ giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 or bbox_cost == 0 or giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. + + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["class_labels"] for v in targets]) + tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(tgt_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py + + +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. 
+ + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# modified from torchvision to also return the union +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. + + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / area + + +# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306 + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index e1f4f3b1fd9fa..f61b8dfdda282 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -24,6 +24,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class ConditionalDETRFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + 
requires_backends(self, ["vision"]) + + class ConvNextFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/conditional_detr/__init__.py b/tests/models/conditional_detr/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py new file mode 100644 index 0000000000000..b5e19fe005da7 --- /dev/null +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -0,0 +1,338 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_vision_available + +from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDETRFeatureExtractor + + +class ConditionalDETRFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "max_size": self.max_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to ConditionalDETRFeatureExtractor, + assuming do_resize is set to True with a scalar size. 
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size * h / w) + expected_width = self.size + elif w > h: + expected_height = self.size + expected_width = int(self.size * w / h) + else: + expected_height = self.size + expected_width = self.size + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + +@require_torch +@require_vision +class ConditionalDETRFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = ConditionalDETRFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = ConditionalDETRFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "max_size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = 
feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_equivalence_pad_and_create_pixel_mask(self): + # Initialize feature_extractors + feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict) + feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test whether the method "pad_and_return_pixel_mask" and calling the feature extractor return the same tensors + encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") + encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") + + assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # encode them + feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + assert torch.allclose(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = 
torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["labels"][0]["size"], expected_size) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # encode them + # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic + feature_extractor = ConditionalDETRFeatureExtractor(format="coco_panoptic") + encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + assert torch.allclose(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify masks + expected_masks_sum = 822338 + self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["labels"][0]["size"], expected_size) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py 
b/tests/models/conditional_detr/test_modeling_conditional_detr.py new file mode 100644 index 0000000000000..38933f26c6627 --- /dev/null +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -0,0 +1,538 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch CONDITIONAL_DETR model. """ + + +import inspect +import math +import unittest + +from transformers import ConditionalDETRConfig, is_timm_available, is_vision_available +from transformers.testing_utils import require_timm, require_vision, slow, torch_device +from transformers.utils import cached_property + +from ...generation.test_generation_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor + + +if is_timm_available(): + import torch + + from transformers import ConditionalDETRForObjectDetection, ConditionalDETRForSegmentation, ConditionalDETRModel + + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDETRFeatureExtractor + + +class ConditionalDETRModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=256, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + min_size=200, + max_size=200, + n_targets=8, + num_labels=91, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.min_size = min_size + self.max_size = max_size + self.n_targets = n_targets + self.num_labels = num_labels + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) + + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + 
target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + return ConditionalDETRConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_conditional_detr_model(self, config, pixel_values, pixel_mask, labels): + model = ConditionalDETRModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) + ) + + def create_and_check_conditional_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = ConditionalDETRForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_timm +class ConditionalDETRModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + ConditionalDETRModel, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ) + if is_timm_available() + else () + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ in ["ConditionalDETRForObjectDetection", "ConditionalDETRForSegmentation"]: + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = 
ConditionalDETRModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConditionalDETRConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_conditional_detr_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_conditional_detr_model(*config_and_inputs) + + def test_conditional_detr_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_conditional_detr_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="CONDITIONAL_DETR does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @slow + def test_model_outputs_equivalence(self): + # TODO Niels: fix me! + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + decoder_seq_length = self.model_tester.decoder_seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + decoder_key_length = self.model_tester.decoder_seq_length + encoder_key_length = self.model_tester.encoder_seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 6 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "ConditionalDETRForObjectDetection": + correct_outlen += 1 + # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks + if model_class.__name__ == "ConditionalDETRForSegmentation": + correct_outlen += 2 + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), 
self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, 
inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "ConditionalDETRForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + configs_no_init.init_xavier_std = 1e9 + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "bbox_attention" in name and "bias" not in name: + self.assertLess( + 100000, + abs(param.data.max().item()), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class ConditionalDETRModelIntegrationTests(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + if is_vision_available() + else None + ) + + def test_inference_no_head(self): + model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape = torch.Size((1, 300, 256)) + assert outputs.last_hidden_state.shape == expected_shape + expected_slice = torch.tensor( + [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + def test_inference_object_detection_head(self): + model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, 
model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + def test_inference_panoptic_segmentation_head(self): + model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-18.1565, -1.7568, -13.5029], [-16.8888, -1.4138, -14.1028], [-17.5709, -2.5080, -11.8654]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.5344, 0.1789, 0.9285], [0.4420, 0.0572, 0.0875], [0.6630, 0.6887, 0.1017]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + expected_shape_masks = torch.Size((1, model.config.num_queries, 200, 267)) + self.assertEqual(outputs.pred_masks.shape, expected_shape_masks) + expected_slice_masks = torch.tensor( + [[-7.7558, -10.8788, -11.9797], [-11.8881, -16.4329, -17.7451], [-14.7316, -19.7383, -20.3004]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, atol=1e-3)) From e28dd8ea3cb921c91506b879b29dc2ea7c52abe8 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:54:36 -0400 Subject: [PATCH 099/233] checked copies --- docs/source/en/index.mdx | 2 +- .../modeling_conditional_detr.py | 63 ++++++++++++------- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 7830c57e6eabd..84d489e22ddf1 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,7 +68,7 @@ The documentation is organized into five sections: 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. 
**[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . +1. **[conditional_detr](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 1f63d22be27a5..4ce1f8f3d5776 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -153,8 +153,8 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding - boxes. + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and @@ -217,12 +217,13 @@ class ConditionalDETRSegmentationOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding - boxes. + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + unnormalized bounding boxes. pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): - Segmentation masks logits for all queries. See also [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or - [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks - respectively. 
+ Segmentation masks logits for all queries. See also + [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation + masks respectively. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and @@ -326,7 +327,6 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) -# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder with Detr->ConditionalDETR class ConditionalDETRTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. @@ -1581,7 +1581,6 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -# Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel): def __init__(self, config: ConditionalDETRConfig): super().__init__(config) @@ -1670,7 +1669,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] @@ -1755,7 +1754,6 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -# Copied from transformers.models.detr.modeling_detr.DetrForSegmentation with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): def __init__(self, config: ConditionalDETRConfig): super().__init__(config) @@ -1807,22 +1805,40 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation - >>> from PIL import Image + >>> import io >>> import requests + >>> from PIL import Image + >>> import torch + >>> import numpy + + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from transformers.models.conditional_detr.feature_extraction_conditional_detr import rgb_to_id >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") - >>> model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained( + ... "facebook/conditional_detr-resnet-50-panoptic" + ... 
) + >>> model = ConditionalDETRForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") + >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") + + >>> # forward pass >>> outputs = model(**inputs) - >>> # model predicts COCO classes, bounding boxes, and masks - >>> logits = outputs.logits - >>> bboxes = outputs.pred_boxes - >>> masks = outputs.pred_masks + + >>> # use the `post_process_panoptic` method of `ConditionalDETRFeatureExtractor` to convert to COCO format + >>> processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0) + >>> result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0] + + >>> # the segmentation is stored in a special-format png + >>> panoptic_seg = Image.open(io.BytesIO(result["png_string"])) + >>> panoptic_seg = numpy.array(panoptic_seg, dtype=numpy.uint8) + >>> # retrieve the ids corresponding to each mask + >>> panoptic_seg_id = rgb_to_id(panoptic_seg) + >>> panoptic_seg_id.shape + (800, 1066) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1903,7 +1919,9 @@ def forward( seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) - pred_masks = seg_masks.view(batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + pred_masks = seg_masks.view( + batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1] + ) loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: @@ -1985,7 +2003,8 @@ def __init__(self, dim, fpn_dims, context_dim): if dim % 8 != 0: raise ValueError( - "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8" + "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in" + " GroupNorm is set to 8" ) inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] @@ -2077,7 +2096,7 @@ def forward(self, q, k, mask: Optional[Tensor] = None): weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) if mask is not None: - weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights From 54562627446de301792de8e3ce0b75160f86fddf Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:55:14 -0400 Subject: [PATCH 100/233] checked copies --- .../models/conditional_detr/modeling_conditional_detr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 4ce1f8f3d5776..1473df759cd8e 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2347,7 +2347,6 @@ def forward(self, outputs, targets): # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDETR,detr->conditional_detr class ConditionalDETRMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), 
used to predict the normalized center coordinates, From d41f433d3f734beb8d7dd68ac1de78e82eebb6f6 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 01:22:01 -0400 Subject: [PATCH 101/233] fixed style and copies --- docs/source/en/_toctree.yml | 8 +- .../configuration_conditional_detr.py | 8 - ..._original_pytorch_checkpoint_to_pytorch.py | 54 +++++-- .../feature_extraction_conditional_detr.py | 4 +- .../modeling_conditional_detr.py | 144 +++++++++++------- 5 files changed, 133 insertions(+), 85 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index c21388b60a6f9..f6d6a94eee334 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,7 +42,8 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - sections: + - isExpanded: false + sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +59,6 @@ - local: tasks/multiple_choice title: Multiple choice title: Task guides - isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification @@ -362,6 +362,8 @@ sections: - local: model_doc/beit title: BEiT + - local: model_doc/conditional_detr + title: ConditionalDETR - local: model_doc/convnext title: ConvNeXT - local: model_doc/cvt @@ -501,4 +503,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API \ No newline at end of file + title: API diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index ce004dc6fc15c..bfb759ca246ca 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -211,14 +211,6 @@ def num_attention_heads(self) -> int: def hidden_size(self) -> int: return self.d_model - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model - class ConditionalDETROnnxConfig(OnnxConfig): diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index 2d7abc3c088b5..4b56d729d1f35 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -94,29 +94,55 @@ rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) # q, k, v projections in self/cross-attention in decoder for conditional DETR - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight") + ) + 
rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight") + ) rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight") + ) # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight") + ) rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight") + ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias") + ) rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias") + ) # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias") + ) rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) - 
rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias") + ) # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads # for conditional DETR, also convert reference point head and query scale MLP @@ -144,7 +170,7 @@ ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), - ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias") + ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias"), ] ) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 07550124c157e..f37e3e1fb69bf 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -704,8 +704,8 @@ def post_process(self, outputs, target_sizes): topk_boxes = topk_indexes // out_logits.shape[2] labels = topk_indexes % out_logits.shape[2] boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4)) - + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + # and from relative [0, 1] to absolute [0, height] coordinates img_h, img_w = target_sizes.unbind(1) scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 1473df759cd8e..54dea544f3da9 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -62,13 +62,13 @@ ] - @dataclass class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): """ - Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, - namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them - gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + Base class for outputs of the Conditional DETR decoder. This class adds one attribute to + BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output + of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary + decoding losses. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -97,9 +97,10 @@ class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): @dataclass class ConditionalDETRModelOutput(Seq2SeqModelOutput): """ - Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, - namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them - gone through a layernorm. 
This is useful when training the model with auxiliary decoding losses. + Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to + Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder + layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding + losses. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -389,6 +390,7 @@ def forward(self, pixel_values, pixel_mask): return out, pos + # Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ @@ -479,6 +481,7 @@ def build_position_encoding(config): return position_embedding + # function to generate sine positional embedding for 2d coordinates def gen_sineembed_for_position(pos_tensor): scale = 2 * math.pi @@ -493,11 +496,13 @@ def gen_sineembed_for_position(pos_tensor): pos = torch.cat((pos_y, pos_x), dim=2) return pos + def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) - return torch.log(x1/x2) + return torch.log(x1 / x2) + class DetrAttention(nn.Module): """ @@ -521,7 +526,8 @@ def __init__( self.head_dim = embed_dim // num_heads if self.head_dim * num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." ) self.scaling = self.head_dim**-0.5 @@ -585,7 +591,8 @@ def forward( if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" ) if attention_mask is not None: @@ -614,7 +621,8 @@ def forward( if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) @@ -625,12 +633,13 @@ def forward( return attn_output, attn_weights_reshaped + class ConditionalDETRAttention(nn.Module): """ Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. - The key q_proj, k_proj, v_proj are defined outside the attention. - This attention allows the dim of q, k to be different to v. + The key q_proj, k_proj, v_proj are defined outside the attention. This attention allows the dim of q, k to be + different to v. """ def __init__( @@ -649,7 +658,8 @@ def __init__( self.head_dim = embed_dim // num_heads if self.head_dim * num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." 
) # head dimension of values self.v_head_dim = out_dim // num_heads @@ -663,6 +673,7 @@ def __init__( def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() @@ -696,7 +707,8 @@ def forward( if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" ) if attention_mask is not None: @@ -725,7 +737,8 @@ def forward( if attn_output.size() != (bsz * self.num_heads, tgt_len, self.v_head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is {attn_output.size()}" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is" + f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.v_head_dim) @@ -818,19 +831,18 @@ def __init__(self, config: ConditionalDETRConfig): self.sa_kcontent_proj = nn.Linear(d_model, d_model) self.sa_kpos_proj = nn.Linear(d_model, d_model) self.sa_v_proj = nn.Linear(d_model, d_model) - + self.self_attn = ConditionalDETRAttention( embed_dim=self.embed_dim, out_dim=self.embed_dim, num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout + dropout=config.attention_dropout, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - # Decoder Cross-Attention projections self.ca_qcontent_proj = nn.Linear(d_model, d_model) @@ -841,10 +853,7 @@ def __init__(self, config: ConditionalDETRConfig): self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) self.encoder_attn = ConditionalDETRAttention( - self.embed_dim*2, - self.embed_dim, - config.decoder_attention_heads, - dropout=config.attention_dropout + self.embed_dim * 2, self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) @@ -862,7 +871,7 @@ def forward( encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, - is_first: Optional[bool] = False + is_first: Optional[bool] = False, ): """ Args: @@ -888,7 +897,9 @@ def forward( # ========== Begin of Self-Attention ============= # Apply projections here # shape: num_queries x batch_size x 256 - q_content = self.sa_qcontent_proj(hidden_states) # target is the input of the first decoder layer. zero by default. + q_content = self.sa_qcontent_proj( + hidden_states + ) # target is the input of the first decoder layer. zero by default. 
q_pos = self.sa_qpos_proj(query_position_embeddings) k_content = self.sa_kcontent_proj(hidden_states) k_pos = self.sa_kpos_proj(query_position_embeddings) @@ -924,7 +935,7 @@ def forward( k_pos = self.ca_kpos_proj(position_embeddings) - # For the first decoder layer, we concatenate the positional embedding predicted from + # For the first decoder layer, we concatenate the positional embedding predicted from # the object query (the positional embedding) into the original query (key) in DETR. if is_first: q_pos = self.ca_qpos_proj(query_position_embeddings) @@ -934,12 +945,12 @@ def forward( q = q_content k = k_content - q = q.view(bs, num_queries, self.nhead, n_model//self.nhead) + q = q.view(bs, num_queries, self.nhead, n_model // self.nhead) query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) - query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model//self.nhead) + query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model // self.nhead) q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) - k = k.view(bs, hw, self.nhead, n_model//self.nhead) - k_pos = k_pos.view(bs, hw, self.nhead, n_model//self.nhead) + k = k.view(bs, hw, self.nhead, n_model // self.nhead) + k_pos = k_pos.view(bs, hw, self.nhead, n_model // self.nhead) k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) # Cross-Attention Block @@ -996,8 +1007,9 @@ def forward(self, hidden_states: torch.Tensor): hidden_states = self.out_proj(hidden_states) return hidden_states + class MLP(nn.Module): - """ Very simple multi-layer perceptron (also called FFN)""" + """Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() @@ -1066,8 +1078,8 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See [`ConditionalDETRFeatureExtractor.__call__`] for - details. + Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See + [`ConditionalDETRFeatureExtractor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Mask to avoid performing attention on padding pixel values. 
Mask values selected in `[0, 1]`: @@ -1323,11 +1335,13 @@ def forward( all_self_attns = () if output_attentions else None all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - reference_points_before_sigmoid = self.ref_point_head(query_position_embeddings) # [num_queries, batch_size, 2] + reference_points_before_sigmoid = self.ref_point_head( + query_position_embeddings + ) # [num_queries, batch_size, 2] reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) - obj_center = reference_points[..., :2].transpose(0, 1) + obj_center = reference_points[..., :2].transpose(0, 1) # get sine embedding for the query vector - query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1339,7 +1353,7 @@ def forward( if idx == 0: pos_transformation = 1 else: - pos_transformation = self.query_scale(hidden_states) + pos_transformation = self.query_scale(hidden_states) # apply transformation query_sine_embed = query_sine_embed_before_transformation * pos_transformation if self.gradient_checkpointing and self.training: @@ -1372,7 +1386,7 @@ def custom_forward(*inputs): encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, - is_first=(idx==0) + is_first=(idx == 0), ) hidden_states = layer_outputs[0] @@ -1401,7 +1415,14 @@ def custom_forward(*inputs): if not return_dict: return tuple( v - for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate, reference_points] + for v in [ + hidden_states, + all_hidden_states, + all_self_attns, + all_cross_attentions, + intermediate, + reference_points, + ] if v is not None ) return ConditionalDETRDecoderOutput( @@ -1410,14 +1431,14 @@ def custom_forward(*inputs): attentions=all_self_attns, cross_attentions=all_cross_attentions, intermediate_hidden_states=intermediate, - reference_points=reference_points + reference_points=reference_points, ) @add_start_docstrings( """ - The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without - any specific head on top. + The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. """, CONDITIONAL_DETR_START_DOCSTRING, ) @@ -1570,14 +1591,14 @@ def forward( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, - reference_points=decoder_outputs.reference_points + reference_points=decoder_outputs.reference_points, ) @add_start_docstrings( """ - CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks - such as COCO detection. + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. 
""", CONDITIONAL_DETR_START_DOCSTRING, ) @@ -1669,7 +1690,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] @@ -1748,8 +1769,8 @@ def forward( @add_start_docstrings( """ - CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks - such as COCO panoptic. + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, + for tasks such as COCO panoptic. """, CONDITIONAL_DETR_START_DOCSTRING, @@ -2154,9 +2175,9 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py class ConditionalDETRLoss(nn.Module): """ - This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process happens in two steps: 1) - we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair - of matched ground-truth / prediction (supervise class and box). + This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process + happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) + we supervise each pair of matched ground-truth / prediction (supervise class and box). @@ -2181,8 +2202,8 @@ def __init__(self, matcher, num_classes, focal_alpha, losses): # removed logging parameter, which was part of the original implementation def loss_labels(self, outputs, targets, indices, num_boxes): """ - Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor of dim - [nb_target_boxes] + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] """ if "logits" not in outputs: raise KeyError("No logits were found in the outputs") @@ -2195,12 +2216,19 @@ def loss_labels(self, outputs, targets, indices, num_boxes): ) target_classes[idx] = target_classes_o - target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2]+1], - dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) + target_classes_onehot = torch.zeros( + [src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1], + dtype=src_logits.dtype, + layout=src_logits.layout, + device=src_logits.device, + ) target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) - target_classes_onehot = target_classes_onehot[:,:,:-1] - loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1] + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * src_logits.shape[1] + ) losses = {"loss_ce": loss_ce} return losses @@ -2430,7 +2458,7 @@ def forward(self, outputs, targets): # Compute the classification cost. 
alpha = 0.25 gamma = 2.0 - neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) class_cost = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] From 5b640592edd2ec0dc3bffe7c986ea9c664ddcba4 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:14:09 -0400 Subject: [PATCH 102/233] fixed style and copies --- .../conditional_detr/configuration_conditional_detr.py | 8 ++++---- ...itional_detr_original_pytorch_checkpoint_to_pytorch.py | 1 + utils/check_repo.py | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index bfb759ca246ca..844260f0d12f5 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,7 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DeppMeng/ConditionalDETR": "https://huggingface.co/DeppMeng/ConditionalDETR/resolve/main/config.json", + "DepuMeng/ConditionalDETR": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", } @@ -36,7 +36,7 @@ class ConditionalDETRConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DeppMeng/ConditionalDETR](https://huggingface.co/DeppMeng/ConditionalDETR) architecture. + [DepuMeng/ConditionalDETR](https://huggingface.co/DepuMeng/ConditionalDETR) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
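The hunks above wire the renamed Hub checkpoint into the configuration and conversion script, and the earlier change to `post_process` selects boxes via a top-k over the sigmoid class logits. A minimal end-to-end sketch of how these pieces fit together is shown below; it assumes the class and checkpoint names as they stand at this point in the patch series (`ConditionalDETRFeatureExtractor`, `ConditionalDETRForObjectDetection`, and the `Atten4Vis/ConditionalDETR` id used by the integration tests), which may not match what is finally published.

```python
# Inference sketch only -- names follow this stage of the patch series, not necessarily
# the released library. Requires `timm` for the ResNet backbone plus `Pillow`/`requests`.
import requests
import torch
from PIL import Image

from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR")
model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR")
model.eval()

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post_process (as modified above) takes the top-k query/class pairs from the sigmoid
# logits and rescales the normalized boxes to absolute (height, width) coordinates.
target_sizes = torch.tensor([image.size[::-1]])  # PIL .size is (width, height)
results = feature_extractor.post_process(outputs, target_sizes)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    if score > 0.7:
        print(f"label={label.item()}  score={score.item():.2f}  box={[round(c, 1) for c in box.tolist()]}")
```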
@@ -114,10 +114,10 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DeppMeng/ConditionalDETR style configuration + >>> # Initializing a CONDITIONAL_DETR DepuMeng/ConditionalDETR style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DeppMeng/ConditionalDETR style configuration + >>> # Initializing a model from the DepuMeng/ConditionalDETR style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index 4b56d729d1f35..f3c6fcadb2e2d 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -293,6 +293,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): model = ConditionalDETRForSegmentation(config) if is_panoptic else ConditionalDETRForObjectDetection(config) model.load_state_dict(state_dict) model.eval() + model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") # verify our conversion original_outputs = conditional_detr(pixel_values) outputs = model(pixel_values) diff --git a/utils/check_repo.py b/utils/check_repo.py index ea3b997f1f126..ffc47aa72af8f 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -60,6 +60,8 @@ "DetrEncoder", # Building part of bigger (tested) model. "DetrDecoder", # Building part of bigger (tested) model. "DetrDecoderWrapper", # Building part of bigger (tested) model. + "ConditionalDETREncoder", # Building part of bigger (tested) model. + "ConditionalDETRDecoder", # Building part of bigger (tested) model. "M2M100Encoder", # Building part of bigger (tested) model. "M2M100Decoder", # Building part of bigger (tested) model. "MCTCTEncoder", # Building part of bigger (tested) model. From cdbeea627fe56488521395cb53fce4f005f93e25 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:18:56 -0400 Subject: [PATCH 103/233] fixed hub --- .../conditional_detr/configuration_conditional_detr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 844260f0d12f5..3c05d959879ec 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,7 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/ConditionalDETR": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", + "DepuMeng/conditional_detr_resnet50": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", } @@ -36,7 +36,7 @@ class ConditionalDETRConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DepuMeng/ConditionalDETR](https://huggingface.co/DepuMeng/ConditionalDETR) architecture. + [DepuMeng/conditional_detr_resnet50](https://huggingface.co/DepuMeng/conditional_detr_resnet50) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -114,10 +114,10 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DepuMeng/ConditionalDETR style configuration + >>> # Initializing a CONDITIONAL_DETR DepuMeng/conditional_detr_resnet50 style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DepuMeng/ConditionalDETR style configuration + >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration From 08f2853ae7395e577609bb2761e8662e4041ff3e Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:27:32 -0400 Subject: [PATCH 104/233] fixed style --- .../models/conditional_detr/configuration_conditional_detr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 3c05d959879ec..4291a6ecc4a94 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,9 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/conditional_detr_resnet50": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", + "DepuMeng/conditional_detr_resnet50": ( + "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json" + ), } From ece15d251c550296d27f0037d35594417bbe6cec Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:08 -0400 Subject: [PATCH 105/233] Update README.md Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a481458cde19e..ec8a0fd2e8b39 100644 --- a/README.md +++ b/README.md @@ -278,7 +278,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From 12a2ffcc2950859186b03ba2a7bd1329f94cb848 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:17 -0400 Subject: [PATCH 106/233] Update docs/source/en/_toctree.yml Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f6d6a94eee334..d158be23cea58 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -363,7 +363,7 @@ - local: model_doc/beit title: BEiT - local: model_doc/conditional_detr - title: ConditionalDETR + title: Conditional DETR - local: model_doc/convnext title: ConvNeXT - local: model_doc/cvt From 0d10b38ec42db31fc8d085eedcd286ea206fde24 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:58 -0400 Subject: [PATCH 107/233] Update docs/source/en/index.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 84d489e22ddf1..4d3aa7711503d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,7 +68,7 @@ The documentation is organized into five sections: 1. 
**[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From f6b798601be153e03e928bbb9a039af2dae2ce44 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:34:10 -0400 Subject: [PATCH 108/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 4291a6ecc4a94..722608a7c945e 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From abf75f426bef7cd51689d43ed0f59d8b4bc786c5 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:34:32 -0400 Subject: [PATCH 109/233] Update src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- ...t_conditional_detr_original_pytorch_checkpoint_to_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index f3c6fcadb2e2d..bcb2bbdda9a7d 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert CONDITIONAL_DETR checkpoints.""" +"""Convert Conditional DETR checkpoints.""" import argparse From 3755993d29b7147f2b510d8232959367edee3a01 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:37:15 -0400 Subject: [PATCH 110/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 722608a7c945e..278ed04c14a35 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" CONDITIONAL_DETR model configuration""" +""" Conditional DETR model configuration""" from collections import OrderedDict from typing import Mapping From 14bb0e7fac0b2a623d2f5f853b598625c61cffc9 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 10:48:44 -0400 Subject: [PATCH 111/233] fixed some doc issue --- docs/source/en/_toctree.yml | 6 +++--- src/transformers/models/auto/feature_extraction_auto.py | 2 -- .../conditional_detr/configuration_conditional_detr.py | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d158be23cea58..dfbdd503e6005 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,8 +42,7 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - isExpanded: false - sections: + - sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +57,8 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides + isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 9ba8295b406da..015fd132ef0dc 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,8 +39,6 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 278ed04c14a35..2d958a0fbe0f5 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,8 +27,8 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/conditional_detr_resnet50": ( - "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json" + "microsoft/conditional_detr_resnet50": ( + "https://huggingface.co/microsoft/conditional_detr_resnet50/resolve/main/config.json" ), } From 6e8b4d828676d91038cee5765f27f864ea44df06 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:36 -0400 Subject: [PATCH 112/233] Update docs/source/en/model_doc/conditional_detr.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/model_doc/conditional_detr.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx index 51316a24e43ee..53129b77ea05a 100644 --- a/docs/source/en/model_doc/conditional_detr.mdx +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. ## Overview -The conditional DETR model was proposed in [ConditionalDETR](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 
Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. +The Conditional DETR model was proposed in [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. The abstract from the paper is the following: From 75b7ba9a0bdc7089b31e8980138e139fcf4eb5ac Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:42 -0400 Subject: [PATCH 113/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/configuration_conditional_detr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 2d958a0fbe0f5..e2a005215d0e0 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -36,9 +36,9 @@ class ConditionalDETRConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate - a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DepuMeng/conditional_detr_resnet50](https://huggingface.co/DepuMeng/conditional_detr_resnet50) architecture. + a Conditional DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Conditional DETR + [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
From bc4d1faa7d43e45080037086dcae1fa90e892224 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:54 -0400 Subject: [PATCH 114/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index e2a005215d0e0..d6013fa7e68ea 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -116,7 +116,7 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DepuMeng/conditional_detr_resnet50 style configuration + >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration >>> configuration = ConditionalDETRConfig() >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration From 88c4fae1db8404f645c5ae398f64005543a3556b Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:59 -0400 Subject: [PATCH 115/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index d6013fa7e68ea..4e0b4674356d9 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -119,7 +119,7 @@ class ConditionalDETRConfig(PretrainedConfig): >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration + >>> # Initializing a model from the microsoft/conditional-detr-resnet-50 style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration From 56e5bca3890ba7a872db7e34635f83215c8744bd Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:19:56 -0400 Subject: [PATCH 116/233] changed prefix to ConditionalDetr --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 3 +- README_zh-hant.md | 3 +- docs/source/en/_toctree.yml | 4 +- docs/source/en/index.mdx | 1 + docs/source/en/model_doc/conditional_detr.mdx | 20 +- src/transformers/__init__.py | 24 +- .../models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 6 +- .../models/conditional_detr/__init__.py | 28 +-- .../configuration_conditional_detr.py | 14 +- ..._original_pytorch_checkpoint_to_pytorch.py | 14 +- .../feature_extraction_conditional_detr.py | 26 +-- .../modeling_conditional_detr.py | 212 +++++++++--------- .../utils/dummy_vision_objects.py | 2 +- ...est_feature_extraction_conditional_detr.py | 16 +- .../test_modeling_conditional_detr.py | 44 ++-- utils/check_repo.py | 4 +- 19 files changed, 215 insertions(+), 210 deletions(-) 
diff --git a/README.md b/README.md index ec8a0fd2e8b39..1ba4614902013 100644 --- a/README.md +++ b/README.md @@ -279,6 +279,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_ko.md b/README_ko.md index de1f2b3e13560..51ad1d1b18693 100644 --- a/README_ko.md +++ b/README_ko.md @@ -228,6 +228,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. diff --git a/README_zh-hans.md b/README_zh-hans.md index f0ffb27c4101f..b2bf8f6043e0d 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -252,7 +252,8 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang发布。 +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. 
**[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 03687087f7a22..366352ef5eb4e 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -264,7 +264,8 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. 
**[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index dfbdd503e6005..a4cd1005e3e83 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -57,7 +57,7 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides isExpanded: false title: Natural Language Processing - sections: @@ -503,4 +503,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API + title: API \ No newline at end of file diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 4d3aa7711503d..579c6172d6a6b 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,6 +69,7 @@ The documentation is organized into five sections: 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx index 53129b77ea05a..d5846cbfee327 100644 --- a/docs/source/en/model_doc/conditional_detr.mdx +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -24,30 +24,30 @@ The abstract from the paper is the following: This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The original code can be found [here](https://github.com/Atten4Vis/ConditionalDETR). -## ConditionalDETRConfig +## ConditionalDetrConfig -[[autodoc]] ConditionalDETRConfig +[[autodoc]] ConditionalDetrConfig -## ConditionalDETRFeatureExtractor +## ConditionalDetrFeatureExtractor -[[autodoc]] ConditionalDETRFeatureExtractor +[[autodoc]] ConditionalDetrFeatureExtractor - __call__ - pad_and_create_pixel_mask - post_process - post_process_segmentation - post_process_panoptic -## ConditionalDETRModel +## ConditionalDetrModel -[[autodoc]] ConditionalDETRModel +[[autodoc]] ConditionalDetrModel - forward -## ConditionalDETRForObjectDetection +## ConditionalDetrForObjectDetection -[[autodoc]] ConditionalDETRForObjectDetection +[[autodoc]] ConditionalDetrForObjectDetection - forward -## ConditionalDETRForSegmentation +## ConditionalDetrForSegmentation -[[autodoc]] ConditionalDETRForSegmentation +[[autodoc]] ConditionalDetrForSegmentation - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b96724b992093..6abc53c85008e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -172,7 +172,7 @@ "CLIPVisionConfig", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], - "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDETRConfig"], + "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDetrConfig"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.cpm": [], @@ -661,7 +661,7 @@ _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") - _import_structure["models.conditional_detr"].append("ConditionalDETRFeatureExtractor") + _import_structure["models.conditional_detr"].append("ConditionalDetrFeatureExtractor") _import_structure["models.donut"].append("DonutFeatureExtractor") _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaProcessor"]) @@ -713,10 +713,10 @@ _import_structure["models.conditional_detr"].extend( [ "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", - "ConditionalDETRForObjectDetection", - "ConditionalDETRForSegmentation", - "ConditionalDETRModel", - "ConditionalDETRPreTrainedModel", + "ConditionalDetrForObjectDetection", + "ConditionalDetrForSegmentation", + 
"ConditionalDetrModel", + "ConditionalDetrPreTrainedModel", ] ) @@ -3086,7 +3086,7 @@ CLIPVisionConfig, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer - from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDETRConfig + from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDetrConfig from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer @@ -3510,7 +3510,7 @@ from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor - from .models.conditional_detr import ConditionalDETRFeatureExtractor + from .models.conditional_detr import ConditionalDetrFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor @@ -3542,10 +3542,10 @@ else: from .models.conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, - ConditionalDETRModel, - ConditionalDETRPreTrainedModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ConditionalDetrPreTrainedModel, ) from .models.deformable_detr import ( DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index b28bb0665bc3d..258fb5d645bf1 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -43,7 +43,7 @@ ("canine", "CanineConfig"), ("clip", "CLIPConfig"), ("codegen", "CodeGenConfig"), - ("conditional_detr", "ConditionalDETRConfig"), + ("conditional_detr", "ConditionalDetrConfig"), ("convbert", "ConvBertConfig"), ("convnext", "ConvNextConfig"), ("ctrl", "CTRLConfig"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 322a48f1cefeb..8ee36125de25d 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -42,7 +42,7 @@ ("canine", "CanineModel"), ("clip", "CLIPModel"), ("codegen", "CodeGenModel"), - ("conditional_detr", "ConditionalDETRModel"), + ("conditional_detr", "ConditionalDetrModel"), ("convbert", "ConvBertModel"), ("convnext", "ConvNextModel"), ("ctrl", "CTRLModel"), @@ -374,7 +374,7 @@ [ # Do not add new models here, this class will be deprecated in the future. 
# Model for Image Segmentation mapping - ("conditional_detr", "ConditionalDETRForSegmentation"), + ("conditional_detr", "ConditionalDetrForSegmentation"), ("detr", "DetrForSegmentation"), ] ) @@ -457,7 +457,7 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Object Detection mapping - ("conditional_detr", "ConditionalDETRForObjectDetection"), + ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), diff --git a/src/transformers/models/conditional_detr/__init__.py b/src/transformers/models/conditional_detr/__init__.py index 1b72e7d7a3238..c2f1bdfdbbaae 100644 --- a/src/transformers/models/conditional_detr/__init__.py +++ b/src/transformers/models/conditional_detr/__init__.py @@ -24,8 +24,8 @@ _import_structure = { "configuration_conditional_detr": [ "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", - "ConditionalDETRConfig", - "ConditionalDETROnnxConfig", + "ConditionalDetrConfig", + "ConditionalDetrOnnxConfig", ] } @@ -35,7 +35,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDETRFeatureExtractor"] + _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDetrFeatureExtractor"] try: if not is_timm_available(): @@ -45,18 +45,18 @@ else: _import_structure["modeling_conditional_detr"] = [ "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", - "ConditionalDETRForObjectDetection", - "ConditionalDETRForSegmentation", - "ConditionalDETRModel", - "ConditionalDETRPreTrainedModel", + "ConditionalDetrForObjectDetection", + "ConditionalDetrForSegmentation", + "ConditionalDetrModel", + "ConditionalDetrPreTrainedModel", ] if TYPE_CHECKING: from .configuration_conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, - ConditionalDETRConfig, - ConditionalDETROnnxConfig, + ConditionalDetrConfig, + ConditionalDetrOnnxConfig, ) try: @@ -65,7 +65,7 @@ except OptionalDependencyNotAvailable: pass else: - from .feature_extraction_conditional_detr import ConditionalDETRFeatureExtractor + from .feature_extraction_conditional_detr import ConditionalDetrFeatureExtractor try: if not is_timm_available(): @@ -75,10 +75,10 @@ else: from .modeling_conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, - ConditionalDETRModel, - ConditionalDETRPreTrainedModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ConditionalDetrPreTrainedModel, ) else: diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 4e0b4674356d9..6abcf85894735 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -33,9 +33,9 @@ } -class ConditionalDETRConfig(PretrainedConfig): +class ConditionalDetrConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate + This is the configuration class to store the configuration of a [`ConditionalDetrModel`]. It is used to instantiate a Conditional DETR model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the Conditional DETR [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) architecture. @@ -48,7 +48,7 @@ class ConditionalDETRConfig(PretrainedConfig): The number of input channels. num_queries (`int`, *optional*, defaults to 100): Number of object queries, i.e. detection slots. This is the maximal number of objects - [`ConditionalDETRModel`] can detect in a single image. For COCO, we recommend 100 queries. + [`ConditionalDetrModel`] can detect in a single image. For COCO, we recommend 100 queries. d_model (`int`, *optional*, defaults to 256): Dimension of the layers. encoder_layers (`int`, *optional*, defaults to 6): @@ -114,13 +114,13 @@ class ConditionalDETRConfig(PretrainedConfig): Examples: ```python - >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig + >>> from transformers import ConditionalDetrModel, ConditionalDetrConfig >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration - >>> configuration = ConditionalDETRConfig() + >>> configuration = ConditionalDetrConfig() >>> # Initializing a model from the microsoft/conditional-detr-resnet-50 style configuration - >>> model = ConditionalDETRModel(configuration) + >>> model = ConditionalDetrModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -214,7 +214,7 @@ def hidden_size(self) -> int: return self.d_model -class ConditionalDETROnnxConfig(OnnxConfig): +class ConditionalDetrOnnxConfig(OnnxConfig): torch_onnx_minimum_version = version.parse("1.11") diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index bcb2bbdda9a7d..904530c44c227 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -26,10 +26,10 @@ import requests from huggingface_hub import hf_hub_download from transformers import ( - ConditionalDETRConfig, - ConditionalDETRFeatureExtractor, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, + ConditionalDetrConfig, + ConditionalDetrFeatureExtractor, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, ) from transformers.utils import logging @@ -226,7 +226,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): """ # load default config - config = ConditionalDETRConfig() + config = ConditionalDetrConfig() # set backbone and dilation attributes if "resnet101" in model_name: config.backbone = "resnet101" @@ -246,7 +246,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): # load feature extractor format = "coco_panoptic" if is_panoptic else "coco_detection" - feature_extractor = ConditionalDETRFeatureExtractor(format=format) + feature_extractor = ConditionalDetrFeatureExtractor(format=format) # prepare image img = prepare_img() @@ -290,7 +290,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): val = state_dict.pop(key) state_dict[prefix + key] = val # finally, create HuggingFace model and load state dict - model = ConditionalDETRForSegmentation(config) if is_panoptic else 
ConditionalDETRForObjectDetection(config) + model = ConditionalDetrForSegmentation(config) if is_panoptic else ConditionalDetrForObjectDetection(config) model.load_state_dict(state_dict) model.eval() model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f37e3e1fb69bf..ec324f1f72456 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -119,7 +119,7 @@ def id_to_rgb(id_map): return color -class ConditionalDETRFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): +class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" Constructs a CONDITIONAL_DETR feature extractor. @@ -431,11 +431,11 @@ def __call__( annotations (`Dict`, `List[Dict]`, *optional*): The corresponding annotations in COCO format. - In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_detection"`, the + In case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_detection"`, the annotations for each image should have the following format: {'image_id': int, 'annotations': [annotation]}, with the annotations being a list of COCO object annotations. - In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the + In case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int, 'file_name': str, 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic annotations. @@ -445,7 +445,7 @@ def __call__( masks_path (`pathlib.Path`, *optional*): Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only - relevant in case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`. + relevant in case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`. pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`): Whether or not to pad images up to the largest image in a batch and create a pixel mask. @@ -676,11 +676,11 @@ def pad_and_create_pixel_mask( # inspired by https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py#L258 def post_process(self, outputs, target_sizes): """ - Converts the output of [`ConditionalDETRForObjectDetection`] into the format expected by the COCO api. Only + Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the COCO api. Only supports PyTorch. Args: - outputs ([`ConditionalDETRObjectDetectionOutput`]): + outputs ([`ConditionalDetrObjectDetectionOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original @@ -717,11 +717,11 @@ def post_process(self, outputs, target_sizes): def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): """ - Converts the output of [`ConditionalDETRForSegmentation`] into image segmentation predictions. Only supports + Converts the output of [`ConditionalDetrForSegmentation`] into image segmentation predictions. 
Only supports PyTorch. Parameters: - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. @@ -760,14 +760,14 @@ def to_tuple(tup): # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): """ - Converts the output of [`ConditionalDETRForSegmentation`] into actual instance segmentation predictions. Only + Converts the output of [`ConditionalDetrForSegmentation`] into actual instance segmentation predictions. Only supports PyTorch. Args: results (`List[Dict]`): - Results list obtained by [`~ConditionalDETRFeatureExtractor.post_process`], to which "masks" results + Results list obtained by [`~ConditionalDetrFeatureExtractor.post_process`], to which "masks" results will be added. - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original @@ -804,11 +804,11 @@ def post_process_instance(self, results, outputs, orig_target_sizes, max_target_ # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): """ - Converts the output of [`ConditionalDETRForSegmentation`] into actual panoptic predictions. Only supports + Converts the output of [`ConditionalDetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch. Parameters: - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 54dea544f3da9..05343a89f3ed0 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -39,7 +39,7 @@ replace_return_docstrings, requires_backends, ) -from .configuration_conditional_detr import ConditionalDETRConfig +from .configuration_conditional_detr import ConditionalDetrConfig if is_scipy_available(): @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = "ConditionalDETRConfig" +_CONFIG_FOR_DOC = "ConditionalDetrConfig" _CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -63,7 +63,7 @@ @dataclass -class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): +class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions): """ Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. 
the output @@ -95,7 +95,7 @@ class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): @dataclass -class ConditionalDETRModelOutput(Seq2SeqModelOutput): +class ConditionalDetrModelOutput(Seq2SeqModelOutput): """ Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder @@ -137,10 +137,10 @@ class ConditionalDETRModelOutput(Seq2SeqModelOutput): @dataclass -# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDETR -class ConditionalDETRObjectDetectionOutput(ModelOutput): +# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDetr +class ConditionalDetrObjectDetectionOutput(ModelOutput): """ - Output type of [`ConditionalDETRForObjectDetection`]. + Output type of [`ConditionalDetrForObjectDetection`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): @@ -154,7 +154,7 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + possible padding). You can use [`~ConditionalDetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) @@ -201,10 +201,10 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): @dataclass -# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDETR -class ConditionalDETRSegmentationOutput(ModelOutput): +# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDetr +class ConditionalDetrSegmentationOutput(ModelOutput): """ - Output type of [`ConditionalDETRForSegmentation`]. + Output type of [`ConditionalDetrForSegmentation`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): @@ -218,12 +218,12 @@ class ConditionalDETRSegmentationOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + possible padding). You can use [`~ConditionalDetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding boxes. pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): Segmentation masks logits for all queries. See also - [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or - [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation + [`~ConditionalDetrFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDetrFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks respectively. 
auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) @@ -272,8 +272,8 @@ class ConditionalDETRSegmentationOutput(ModelOutput): # BELOW: utilities copied from # https://github.com/facebookresearch/detr/blob/master/backbone.py -# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDETR -class ConditionalDETRFrozenBatchNorm2d(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDetr +class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ BatchNorm2d where the batch statistics and the affine parameters are fixed. @@ -282,7 +282,7 @@ class ConditionalDETRFrozenBatchNorm2d(nn.Module): """ def __init__(self, n): - super(ConditionalDETRFrozenBatchNorm2d, self).__init__() + super(ConditionalDetrFrozenBatchNorm2d, self).__init__() self.register_buffer("weight", torch.ones(n)) self.register_buffer("bias", torch.zeros(n)) self.register_buffer("running_mean", torch.zeros(n)) @@ -295,7 +295,7 @@ def _load_from_state_dict( if num_batches_tracked_key in state_dict: del state_dict[num_batches_tracked_key] - super(ConditionalDETRFrozenBatchNorm2d, self)._load_from_state_dict( + super(ConditionalDetrFrozenBatchNorm2d, self)._load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) @@ -312,12 +312,12 @@ def forward(self, x): return x * scale + bias -# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDetr def replace_batch_norm(m, name=""): for attr_str in dir(m): target_attr = getattr(m, attr_str) if isinstance(target_attr, nn.BatchNorm2d): - frozen = ConditionalDETRFrozenBatchNorm2d(target_attr.num_features) + frozen = ConditionalDetrFrozenBatchNorm2d(target_attr.num_features) bn = getattr(m, attr_str) frozen.weight.data.copy_(bn.weight) frozen.bias.data.copy_(bn.bias) @@ -328,11 +328,11 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) -class ConditionalDETRTimmConvEncoder(nn.Module): +class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. - nn.BatchNorm2d layers are replaced by ConditionalDETRFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. """ @@ -369,8 +369,8 @@ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): return out -# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDETR -class ConditionalDETRConvModel(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDetr +class ConditionalDetrConvModel(nn.Module): """ This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. """ @@ -391,7 +391,7 @@ def forward(self, pixel_values, pixel_mask): return out, pos -# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDetr def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
@@ -406,8 +406,8 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDETR -class ConditionalDETRSinePositionEmbedding(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDetr +class ConditionalDetrSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. @@ -444,8 +444,8 @@ def forward(self, pixel_values, pixel_mask): return pos -# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDETR -class ConditionalDETRLearnedPositionEmbedding(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDetr +class ConditionalDetrLearnedPositionEmbedding(nn.Module): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -468,14 +468,14 @@ def forward(self, pixel_values, pixel_mask=None): return pos -# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDetr def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = ConditionalDETRSinePositionEmbedding(n_steps, normalize=True) + position_embedding = ConditionalDetrSinePositionEmbedding(n_steps, normalize=True) elif config.position_embedding_type == "learned": - position_embedding = ConditionalDETRLearnedPositionEmbedding(n_steps) + position_embedding = ConditionalDetrLearnedPositionEmbedding(n_steps) else: raise ValueError(f"Not supported {config.position_embedding_type}") @@ -634,7 +634,7 @@ def forward( return attn_output, attn_weights_reshaped -class ConditionalDETRAttention(nn.Module): +class ConditionalDetrAttention(nn.Module): """ Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. 
@@ -750,8 +750,8 @@ def forward( return attn_output, attn_weights_reshaped -class ConditionalDETREncoderLayer(nn.Module): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrEncoderLayer(nn.Module): + def __init__(self, config: ConditionalDetrConfig): super().__init__() self.embed_dim = config.d_model self.self_attn = DetrAttention( @@ -819,8 +819,8 @@ def forward( return outputs -class ConditionalDETRDecoderLayer(nn.Module): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrDecoderLayer(nn.Module): + def __init__(self, config: ConditionalDetrConfig): super().__init__() self.embed_dim = config.d_model @@ -832,7 +832,7 @@ def __init__(self, config: ConditionalDETRConfig): self.sa_kpos_proj = nn.Linear(d_model, d_model) self.sa_v_proj = nn.Linear(d_model, d_model) - self.self_attn = ConditionalDETRAttention( + self.self_attn = ConditionalDetrAttention( embed_dim=self.embed_dim, out_dim=self.embed_dim, num_heads=config.decoder_attention_heads, @@ -852,7 +852,7 @@ def __init__(self, config: ConditionalDETRConfig): self.ca_v_proj = nn.Linear(d_model, d_model) self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) - self.encoder_attn = ConditionalDETRAttention( + self.encoder_attn = ConditionalDetrAttention( self.embed_dim * 2, self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -989,8 +989,8 @@ def forward( return outputs -# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDETR -class ConditionalDETRClassificationHead(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDetr +class ConditionalDetrClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): @@ -1023,9 +1023,9 @@ def forward(self, x): return x -# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDETR -class ConditionalDETRPreTrainedModel(PreTrainedModel): - config_class = ConditionalDETRConfig +# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDetr +class ConditionalDetrPreTrainedModel(PreTrainedModel): + config_class = ConditionalDetrConfig base_model_prefix = "model" main_input_name = "pixel_values" @@ -1033,12 +1033,12 @@ def _init_weights(self, module): std = self.config.init_std xavier_std = self.config.init_xavier_std - if isinstance(module, ConditionalDETRMHAttentionMap): + if isinstance(module, ConditionalDetrMHAttentionMap): nn.init.zeros_(module.k_linear.bias) nn.init.zeros_(module.q_linear.bias) nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) - elif isinstance(module, ConditionalDETRLearnedPositionEmbedding): + elif isinstance(module, ConditionalDetrLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): @@ -1053,7 +1053,7 @@ def _init_weights(self, module): module.weight.data[module.padding_idx].zero_() def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, ConditionalDETRDecoder): + if isinstance(module, ConditionalDetrDecoder): module.gradient_checkpointing = value @@ -1067,7 +1067,7 @@ def _set_gradient_checkpointing(self, module, 
value=False): and behavior. Parameters: - config ([`ConditionalDETRConfig`]): + config ([`ConditionalDetrConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. @@ -1078,8 +1078,8 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See - [`ConditionalDETRFeatureExtractor.__call__`] for details. + Pixel values can be obtained using [`ConditionalDetrFeatureExtractor`]. See + [`ConditionalDetrFeatureExtractor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: @@ -1112,11 +1112,11 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR -class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDetr +class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a - [`ConditionalDETREncoderLayer`]. + [`ConditionalDetrEncoderLayer`]. The encoder updates the flattened feature map through multiple self-attention layers. @@ -1125,16 +1125,16 @@ class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): - position_embeddings are added to the forward pass. Args: - config: ConditionalDETRConfig + config: ConditionalDetrConfig """ - def __init__(self, config: ConditionalDETRConfig): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.encoder_layerdrop - self.layers = nn.ModuleList([ConditionalDETREncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default @@ -1222,9 +1222,9 @@ def forward( ) -class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): +class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel): """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDETRDecoderLayer`]. + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDetrDecoderLayer`]. The decoder updates the query embeddings through multiple self-attention and cross-attention layers. @@ -1234,15 +1234,15 @@ class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. 
Args: - config: ConditionalDETRConfig + config: ConditionalDetrConfig """ - def __init__(self, config: ConditionalDETRConfig): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.decoder_layerdrop - self.layers = nn.ModuleList([ConditionalDETRDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output self.layernorm = nn.LayerNorm(config.d_model) d_model = config.d_model @@ -1425,7 +1425,7 @@ def custom_forward(*inputs): ] if v is not None ) - return ConditionalDETRDecoderOutput( + return ConditionalDetrDecoderOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attns, @@ -1442,22 +1442,22 @@ def custom_forward(*inputs): """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRModel(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrModel(ConditionalDetrPreTrainedModel): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # Create backbone + positional encoding - backbone = ConditionalDETRTimmConvEncoder(config.backbone, config.dilation) + backbone = ConditionalDetrTimmConvEncoder(config.backbone, config.dilation) position_embeddings = build_position_encoding(config) - self.backbone = ConditionalDETRConvModel(backbone, position_embeddings) + self.backbone = ConditionalDetrConvModel(backbone, position_embeddings) # Create projection layer self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) - self.encoder = ConditionalDETREncoder(config) - self.decoder = ConditionalDETRDecoder(config) + self.encoder = ConditionalDetrEncoder(config) + self.decoder = ConditionalDetrDecoder(config) # Initialize weights and apply final processing self.post_init() @@ -1477,7 +1477,7 @@ def unfreeze_backbone(self): param.requires_grad_(True) @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRModelOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1496,15 +1496,15 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRModel + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrModel >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state @@ -1582,7 +1582,7 @@ def forward( if not return_dict: return decoder_outputs + encoder_outputs - return ConditionalDETRModelOutput( + return 
ConditionalDetrModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, @@ -1602,18 +1602,18 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # CONDITIONAL_DETR encoder-decoder model - self.model = ConditionalDETRModel(config) + self.model = ConditionalDetrModel(config) # Object detection heads self.class_labels_classifier = nn.Linear( config.d_model, config.num_labels ) # We add one for the "no object" class - self.bbox_predictor = ConditionalDETRMLPPredictionHead( + self.bbox_predictor = ConditionalDetrMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) @@ -1629,7 +1629,7 @@ def _set_aux_loss(self, outputs_class, outputs_coord): return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1655,15 +1655,15 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForObjectDetection >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) @@ -1703,12 +1703,12 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: # First: create the matcher - matcher = ConditionalDETRHungarianMatcher( + matcher = ConditionalDetrHungarianMatcher( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion losses = ["labels", "boxes", "cardinality"] - criterion = ConditionalDETRLoss( + criterion = ConditionalDetrLoss( matcher=matcher, num_classes=self.config.num_labels, focal_alpha=self.config.focal_alpha, @@ -1751,7 +1751,7 @@ def forward( output = (logits, pred_boxes) + outputs return ((loss, loss_dict) + output) if loss is not None else output - return ConditionalDETRObjectDetectionOutput( + return ConditionalDetrObjectDetectionOutput( loss=loss, loss_dict=loss_dict, logits=logits, @@ -1775,22 +1775,22 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel): + def 
__init__(self, config: ConditionalDetrConfig): super().__init__(config) # object detection model - self.conditional_detr = ConditionalDETRForObjectDetection(config) + self.conditional_detr = ConditionalDetrForObjectDetection(config) # segmentation head hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads intermediate_channel_sizes = self.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes - self.mask_head = ConditionalDETRMaskHeadSmallConv( + self.mask_head = ConditionalDetrMaskHeadSmallConv( hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size ) - self.bbox_attention = ConditionalDETRMHAttentionMap( + self.bbox_attention = ConditionalDetrMHAttentionMap( hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std ) @@ -1798,7 +1798,7 @@ def __init__(self, config: ConditionalDETRConfig): self.post_init() @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRSegmentationOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrSegmentationOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1832,16 +1832,16 @@ def forward( >>> import torch >>> import numpy - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForSegmentation >>> from transformers.models.conditional_detr.feature_extraction_conditional_detr import rgb_to_id >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained( + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained( ... "facebook/conditional_detr-resnet-50-panoptic" ... 
) - >>> model = ConditionalDETRForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") + >>> model = ConditionalDetrForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") @@ -1849,7 +1849,7 @@ def forward( >>> # forward pass >>> outputs = model(**inputs) - >>> # use the `post_process_panoptic` method of `ConditionalDETRFeatureExtractor` to convert to COCO format + >>> # use the `post_process_panoptic` method of `ConditionalDetrFeatureExtractor` to convert to COCO format >>> processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0) >>> result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0] @@ -1947,12 +1947,12 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: # First: create the matcher - matcher = ConditionalDETRHungarianMatcher( + matcher = ConditionalDetrHungarianMatcher( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion losses = ["labels", "boxes", "cardinality", "masks"] - criterion = ConditionalDETRLoss( + criterion = ConditionalDetrLoss( matcher=matcher, num_classes=self.config.num_labels, focal_alpha=self.config.focal_alpha, @@ -1991,7 +1991,7 @@ def forward( output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs return ((loss, loss_dict) + output) if loss is not None else output - return ConditionalDETRSegmentationOutput( + return ConditionalDetrSegmentationOutput( loss=loss, loss_dict=loss_dict, logits=logits, @@ -2013,8 +2013,8 @@ def _expand(tensor, length: int): # taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py -# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDETR -class ConditionalDETRMaskHeadSmallConv(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDetr +class ConditionalDetrMaskHeadSmallConv(nn.Module): """ Simple convolutional head, using group norm. Upsampling is done using a FPN approach """ @@ -2094,8 +2094,8 @@ def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): return x -# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDETR -class ConditionalDETRMHAttentionMap(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDetr +class ConditionalDetrMHAttentionMap(nn.Module): """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): @@ -2173,16 +2173,16 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py -class ConditionalDETRLoss(nn.Module): +class ConditionalDetrLoss(nn.Module): """ - This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process + This class computes the losses for ConditionalDetrForObjectDetection/ConditionalDetrForSegmentation. The process happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box). 
Args: - matcher (`ConditionalDETRHungarianMatcher`): + matcher (`ConditionalDetrHungarianMatcher`): Module able to compute a matching between targets and proposals. num_classes (`int`): Number of object categories, omitting the special no-object category. @@ -2375,7 +2375,7 @@ def forward(self, outputs, targets): # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -class ConditionalDETRMLPPredictionHead(nn.Module): +class ConditionalDetrMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, height and width of a bounding box w.r.t. an image. @@ -2397,7 +2397,7 @@ def forward(self, x): # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py -class ConditionalDETRHungarianMatcher(nn.Module): +class ConditionalDetrHungarianMatcher(nn.Module): """ This class computes an assignment between the targets and the predictions of the network. diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index f61b8dfdda282..d2ec5be33ceb8 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -24,7 +24,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class ConditionalDETRFeatureExtractor(metaclass=DummyObject): +class ConditionalDetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index b5e19fe005da7..2d21cafc68520 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -32,10 +32,10 @@ if is_vision_available(): from PIL import Image - from transformers import ConditionalDETRFeatureExtractor + from transformers import ConditionalDetrFeatureExtractor -class ConditionalDETRFeatureExtractionTester(unittest.TestCase): +class ConditionalDetrFeatureExtractionTester(unittest.TestCase): def __init__( self, parent, @@ -74,7 +74,7 @@ def prepare_feat_extract_dict(self): def get_expected_values(self, image_inputs, batched=False): """ - This function computes the expected height and width when providing images to ConditionalDETRFeatureExtractor, + This function computes the expected height and width when providing images to ConditionalDetrFeatureExtractor, assuming do_resize is set to True with a scalar size. 
""" if not batched: @@ -106,12 +106,12 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision -class ConditionalDETRFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): +class ConditionalDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): - feature_extraction_class = ConditionalDETRFeatureExtractor if is_vision_available() else None + feature_extraction_class = ConditionalDetrFeatureExtractor if is_vision_available() else None def setUp(self): - self.feature_extract_tester = ConditionalDETRFeatureExtractionTester(self) + self.feature_extract_tester = ConditionalDetrFeatureExtractionTester(self) @property def feat_extract_dict(self): @@ -253,7 +253,7 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} # encode them - feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") # verify pixel values @@ -300,7 +300,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): # encode them # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic - feature_extractor = ConditionalDETRFeatureExtractor(format="coco_panoptic") + feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic") encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") # verify pixel values diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 38933f26c6627..fd008b9af494f 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -19,7 +19,7 @@ import math import unittest -from transformers import ConditionalDETRConfig, is_timm_available, is_vision_available +from transformers import ConditionalDetrConfig, is_timm_available, is_vision_available from transformers.testing_utils import require_timm, require_vision, slow, torch_device from transformers.utils import cached_property @@ -31,16 +31,16 @@ if is_timm_available(): import torch - from transformers import ConditionalDETRForObjectDetection, ConditionalDETRForSegmentation, ConditionalDETRModel + from transformers import ConditionalDetrForObjectDetection, ConditionalDetrForSegmentation, ConditionalDetrModel if is_vision_available(): from PIL import Image - from transformers import ConditionalDETRFeatureExtractor + from transformers import ConditionalDetrFeatureExtractor -class ConditionalDETRModelTester: +class ConditionalDetrModelTester: def __init__( self, parent, @@ -105,7 +105,7 @@ def prepare_config_and_inputs(self): return config, pixel_values, pixel_mask, labels def get_config(self): - return ConditionalDETRConfig( + return ConditionalDetrConfig( d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers, @@ -125,7 +125,7 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict def create_and_check_conditional_detr_model(self, config, pixel_values, pixel_mask, labels): - model = ConditionalDETRModel(config=config) + model = ConditionalDetrModel(config=config) model.to(torch_device) model.eval() @@ -137,7 +137,7 @@ def create_and_check_conditional_detr_model(self, config, 
pixel_values, pixel_ma ) def create_and_check_conditional_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): - model = ConditionalDETRForObjectDetection(config=config) + model = ConditionalDetrForObjectDetection(config=config) model.to(torch_device) model.eval() @@ -155,12 +155,12 @@ def create_and_check_conditional_detr_object_detection_head_model(self, config, @require_timm -class ConditionalDETRModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( ( - ConditionalDETRModel, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, + ConditionalDetrModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, ) if is_timm_available() else () @@ -176,7 +176,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class.__name__ in ["ConditionalDETRForObjectDetection", "ConditionalDETRForSegmentation"]: + if model_class.__name__ in ["ConditionalDetrForObjectDetection", "ConditionalDetrForSegmentation"]: labels = [] for i in range(self.model_tester.batch_size): target = {} @@ -199,8 +199,8 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict def setUp(self): - self.model_tester = ConditionalDETRModelTester(self) - self.config_tester = ConfigTester(self, config_class=ConditionalDETRConfig, has_text_modality=False) + self.model_tester = ConditionalDetrModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConditionalDetrConfig, has_text_modality=False) def test_config(self): self.config_tester.run_common_tests() @@ -279,10 +279,10 @@ def test_attention_outputs(self): if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning # Object Detection model returns pred_logits and pred_boxes - if model_class.__name__ == "ConditionalDETRForObjectDetection": + if model_class.__name__ == "ConditionalDetrForObjectDetection": correct_outlen += 1 # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks - if model_class.__name__ == "ConditionalDETRForSegmentation": + if model_class.__name__ == "ConditionalDetrForSegmentation": correct_outlen += 2 if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned @@ -406,7 +406,7 @@ def test_different_timm_backbone(self): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - if model_class.__name__ == "ConditionalDETRForObjectDetection": + if model_class.__name__ == "ConditionalDetrForObjectDetection": expected_shape = ( self.model_tester.batch_size, self.model_tester.num_queries, @@ -452,17 +452,17 @@ def prepare_img(): @require_timm @require_vision @slow -class ConditionalDETRModelIntegrationTests(unittest.TestCase): +class ConditionalDetrModelIntegrationTests(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") if is_vision_available() else None ) def test_inference_no_head(self): - model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) 
feature_extractor = self.default_feature_extractor image = prepare_img() @@ -479,7 +479,7 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +505,7 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() diff --git a/utils/check_repo.py b/utils/check_repo.py index ffc47aa72af8f..8659f795602e2 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -60,8 +60,8 @@ "DetrEncoder", # Building part of bigger (tested) model. "DetrDecoder", # Building part of bigger (tested) model. "DetrDecoderWrapper", # Building part of bigger (tested) model. - "ConditionalDETREncoder", # Building part of bigger (tested) model. - "ConditionalDETRDecoder", # Building part of bigger (tested) model. + "ConditionalDetrEncoder", # Building part of bigger (tested) model. + "ConditionalDetrDecoder", # Building part of bigger (tested) model. "M2M100Encoder", # Building part of bigger (tested) model. "M2M100Decoder", # Building part of bigger (tested) model. "MCTCTEncoder", # Building part of bigger (tested) model. From 5194ea92997f054c3e5782bce13ae578e58578c9 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:23:00 -0400 Subject: [PATCH 117/233] fixed docs --- README.md | 1 - README_zh-hans.md | 1 - README_zh-hant.md | 1 - docs/source/en/index.mdx | 3 +-- docs/source/en/serialization.mdx | 2 +- 5 files changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1ba4614902013..ec8a0fd2e8b39 100644 --- a/README.md +++ b/README.md @@ -279,7 +279,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. 
**[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_zh-hans.md b/README_zh-hans.md index b2bf8f6043e0d..88611a5f672bd 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -253,7 +253,6 @@ conda install -c huggingface transformers 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 366352ef5eb4e..b84d3ea8ca974 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -265,7 +265,6 @@ conda install -c huggingface transformers 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 579c6172d6a6b..40685a5c2fa8d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,7 +69,6 @@ The documentation is organized into five sections: 1. 
**[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. @@ -217,7 +216,7 @@ Flax), PyTorch, and/or TensorFlow. | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | -| conditional_detr | ❌ | ❌ | ✅ | ❌ | ❌ | +| Conditional DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index f6304004b62ae..a1577447f7235 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -57,7 +57,7 @@ Ready-made configurations include the following architectures: - CamemBERT - CLIP - CodeGen -- conditional_detr +- Conditional DETR - ConvBERT - ConvNeXT - Data2VecText From bbc033454f6a5729fd2a2789b6c239bd93d488b8 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 21:50:18 -0400 Subject: [PATCH 118/233] Update README_ko.md --- README_ko.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README_ko.md b/README_ko.md index 51ad1d1b18693..8a2df2fcbc888 100644 --- a/README_ko.md +++ b/README_ko.md @@ -229,7 +229,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From 194276b2de8180522ac890f58285d1b6d9c0b9e6 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:20:11 -0400 Subject: [PATCH 119/233] added spatial_model_name --- utils/check_copies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check_copies.py b/utils/check_copies.py index 7d57173654468..0c8afa7b77ae8 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,6 +477,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", + "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. 
This only concerns modular models or those who do From 91bbe0794923d2231d832a1e8660b1e167bbd0cd Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:24:51 -0400 Subject: [PATCH 120/233] fixed fix-copies --- src/transformers/models/auto/configuration_auto.py | 2 +- utils/check_copies.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 258fb5d645bf1..dce73cb390365 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -302,7 +302,7 @@ ("canine", "CANINE"), ("clip", "CLIP"), ("codegen", "CodeGen"), - ("conditional_detr", "conditional_detr"), + ("conditional_detr", "Conditional DETR"), ("convbert", "ConvBERT"), ("convnext", "ConvNeXT"), ("cpm", "CPM"), diff --git a/utils/check_copies.py b/utils/check_copies.py index 0c8afa7b77ae8..7d57173654468 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,7 +477,6 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", - "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. This only concerns modular models or those who do From 43968598e53bf99e3e689b46e71226e216fea370 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:44:17 -0400 Subject: [PATCH 121/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index ec324f1f72456..f072310f7f878 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Feature extractor class for CONDITIONAL_DETR.""" +"""Feature extractor class for Conditional DETR.""" import io import pathlib From 763e101f09c38d2674fafd06d466693a2cf2fa23 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:44:58 -0400 Subject: [PATCH 122/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f072310f7f878..bb000a7996b3c 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -37,7 +37,7 @@ ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] -# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +# Copied from transformers.models.detr.feature_extraction_detr.center_to_corners_format def center_to_corners_format(x): """ Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format From f96899e07dca0aab5c367af6854a226d94b1258a Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:45:19 -0400 Subject: [PATCH 123/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index bb000a7996b3c..d89f3154e0bf0 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -48,6 +48,7 @@ def center_to_corners_format(x): return torch.stack(b, dim=-1) +# Copied from transformers.models.detr.feature_extraction_detr.corners_to_center_format def corners_to_center_format(x): """ Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1, From 5cc02c102fc227e102cb945ee34b1345c72f317c Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:45:43 -0400 Subject: [PATCH 124/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index d89f3154e0bf0..d813ab2a45e3c 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -60,6 +60,7 @@ def corners_to_center_format(x): return np.stack(b, axis=-1) +# Copied from transformers.models.detr.feature_extraction_detr.masks_to_boxes def masks_to_boxes(masks): """ Compute the 
bounding boxes around the provided panoptic segmentation masks. From 5cca671b1313864a8bb42ed40fa0753848f253d3 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:46:05 -0400 Subject: [PATCH 125/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 05343a89f3ed0..3bf47082f6b30 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Attn4Vis and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 Microsoft Research Asia and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 10dab49e1197d927d010403785a7bc14eb693e0f Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:46:25 -0400 Subject: [PATCH 126/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 3bf47082f6b30..7d45259b60adc 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -57,7 +57,7 @@ _CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Atten4Vis/ConditionalDETR", + "microsoft/conditional-detr-resnet-50", # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr ] From eecd87cd1428985cd7b6f68c8a90d26b38cd05d9 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:51:20 -0400 Subject: [PATCH 127/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 7d45259b60adc..bf4d82479375e 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -270,8 +270,6 @@ class ConditionalDetrSegmentationOutput(ModelOutput): encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None -# BELOW: utilities copied from -# https://github.com/facebookresearch/detr/blob/master/backbone.py # Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDetr class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ From e6d50155e5b0f2d7184798fd1a9b703347b7c7e9 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:52:46 -0400 Subject: [PATCH 128/233] Update 
src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index bf4d82479375e..8550bba968802 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -326,6 +326,7 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) +# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. From 63a00b2871551cfcd414cc6d0011c658202d7f0f Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:54:05 -0400 Subject: [PATCH 129/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 8550bba968802..415f0d47a41f5 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -503,6 +503,7 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) +# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. From 06f394e4d5623ccc57b8ae6e54d408123611b456 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:56:48 -0400 Subject: [PATCH 130/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 415f0d47a41f5..be07eca130e23 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1112,7 +1112,7 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDetr +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL DETR,Detr->ConditionalDetr class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. 
Each layer is a From ec48112a16bb3f05f47c77ea0c74eb5425ef61d5 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:04 -0400 Subject: [PATCH 131/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index be07eca130e23..00faac56fa1de 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1228,7 +1228,7 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel): The decoder updates the query embeddings through multiple self-attention and cross-attention layers. - Some small tweaks for CONDITIONAL_DETR: + Some small tweaks for Conditional DETR: - position_embeddings and query_position_embeddings are added to the forward pass. - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. From 81b64d73fc4414705b013c8d26405ea9b4b77977 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:19 -0400 Subject: [PATCH 132/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 00faac56fa1de..849502de9323a 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1243,7 +1243,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layerdrop = config.decoder_layerdrop self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) - # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output + # in Conditional DETR, the decoder uses layernorm after the last decoder layer output self.layernorm = nn.LayerNorm(config.d_model) d_model = config.d_model self.gradient_checkpointing = False From be85f014812555f5a9ba2356286f2e09296f0ad9 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:42 -0400 Subject: [PATCH 133/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 849502de9323a..a063c1c2e46e4 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1437,7 +1437,7 @@ def custom_forward(*inputs): @add_start_docstrings( """ - The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + The bare Conditional DETR Model (consisting of a backbone and encoder-decoder Transformer) 
outputting raw hidden-states without any specific head on top. """, CONDITIONAL_DETR_START_DOCSTRING, From fcc2970d2a5e41f96b3119af89a04552cbc82485 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:26:13 -0400 Subject: [PATCH 134/233] added some copied from --- .../feature_extraction_conditional_detr.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index d813ab2a45e3c..1421e82aa789a 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -94,9 +94,7 @@ def masks_to_boxes(masks): return np.stack([x_min, y_min, x_max, y_max], 1) -# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py -# Copyright (c) 2018, Alexander Kirillov -# All rights reserved. +# Copied from transformers.models.detr.feature_extraction_detr.rgb_to_id def rgb_to_id(color): if isinstance(color, np.ndarray) and len(color.shape) == 3: if color.dtype == np.uint8: @@ -104,7 +102,7 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) - +# Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): id_map_copy = id_map.copy() @@ -123,7 +121,7 @@ def id_to_rgb(id_map): class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" - Constructs a CONDITIONAL_DETR feature extractor. + Constructs a CONDITIONAL DETR feature extractor. This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. @@ -188,7 +186,7 @@ def prepare(self, image, target, return_segmentation_masks=False, masks_path=Non else: raise ValueError(f"Format {self.format} not supported") - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, segmentations, height, width): try: @@ -212,10 +210,10 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->Conditional DETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by CONDITIONAL_DETR. + Convert the target in COCO format into the format expected by CONDITIONAL DETR. 
""" w, h = image.size @@ -277,6 +275,7 @@ def prepare_coco_detection(self, image, target, return_segmentation_masks=False) return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_panoptic def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): w, h = image.size ann_info = target.copy() @@ -310,6 +309,7 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,6 +380,7 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. @@ -401,7 +402,8 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -549,7 +551,7 @@ def __call__( if annotations is not None: annotations = [annotations] - # prepare (COCO annotations as a list of Dict -> CONDITIONAL_DETR target as a single Dict per image) + # prepare (COCO annotations as a list of Dict -> CONDITIONAL DETR target as a single Dict per image) if annotations is not None: for idx, (image, target) in enumerate(zip(images, annotations)): if not isinstance(image, Image.Image): @@ -624,6 +626,7 @@ def __call__( return encoded_inputs + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] @@ -632,6 +635,7 @@ def _max_by_axis(self, the_list): maxes[index] = max(maxes[index], item) return maxes + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.pad_and_create_pixel_mask def pad_and_create_pixel_mask( self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None ): @@ -717,6 +721,7 @@ def post_process(self, outputs, target_sizes): return results + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_segmentation with Detr->ConditionalDetr def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): """ Converts the output of [`ConditionalDetrForSegmentation`] into image segmentation predictions. Only supports @@ -759,7 +764,7 @@ def to_tuple(tup): preds.append(predictions) return preds - # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_instance with Detr->ConditionalDetr def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): """ Converts the output of [`ConditionalDetrForSegmentation`] into actual instance segmentation predictions. 
Only @@ -803,7 +808,7 @@ def post_process_instance(self, results, outputs, orig_target_sizes, max_target_ return results - # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_panoptic with Detr->ConditionalDetr def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): """ Converts the output of [`ConditionalDetrForSegmentation`] into actual panoptic predictions. Only supports From 1f3b2ce121d8a61d2009d4a8c30a6436ae2da92c Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:27:39 -0400 Subject: [PATCH 135/233] added some copied from --- .../conditional_detr/feature_extraction_conditional_detr.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 1421e82aa789a..812a0fe103cea 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -309,7 +309,6 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,7 +379,6 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. 
@@ -403,7 +401,6 @@ def _normalize(self, image, mean, std, target=None): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -626,7 +623,6 @@ def __call__( return encoded_inputs - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] From 098b2217846e27a718577e2bf465a57d96ce4c26 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:40:45 -0400 Subject: [PATCH 136/233] added some copied from --- .../feature_extraction_conditional_detr.py | 3 +- .../modeling_conditional_detr.py | 52 +++++++++++-------- ...est_feature_extraction_conditional_detr.py | 4 +- .../test_modeling_conditional_detr.py | 8 +-- 4 files changed, 36 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 812a0fe103cea..c9f5d8124e539 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -210,10 +210,9 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->Conditional DETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by CONDITIONAL DETR. + Convert the target in COCO format into the format expected by Conditional DETR. """ w, h = image.size diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a063c1c2e46e4..435a80b220b3b 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -54,11 +54,11 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "ConditionalDetrConfig" -_CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" +_CHECKPOINT_FOR_DOC = "microsoft/conditional-detr-resnet-50" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ "microsoft/conditional-detr-resnet-50", - # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr + # See all Conditional DETR models at https://huggingface.co/models?filter=conditional_detr ] @@ -331,11 +331,11 @@ class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. - nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. 
""" - def __init__(self, name: str, dilation: bool): + def __init__(self, name: str, dilation: bool, use_pretrained_backbone: bool, num_channels: int = 3): super().__init__() kwargs = {} @@ -344,7 +344,14 @@ def __init__(self, name: str, dilation: bool): requires_backends(self, ["timm"]) - backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs) + backbone = create_model( + name, + pretrained=use_pretrained_backbone, + features_only=True, + out_indices=(1, 2, 3, 4), + in_chans=num_channels, + **kwargs, + ) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) @@ -482,7 +489,7 @@ def build_position_encoding(config): # function to generate sine positional embedding for 2d coordinates -def gen_sineembed_for_position(pos_tensor): +def gen_sine_position_embeddings(pos_tensor): scale = 2 * math.pi dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) dim_t = 10000 ** (2 * (dim_t // 2) / 128) @@ -503,12 +510,11 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) -# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the CONDITIONAL_DETR paper). + Here, we add position embeddings to the queries and keys (as explained in the Conditional DETR paper). """ def __init__( @@ -1112,7 +1118,6 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL DETR,Detr->ConditionalDetr class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -1120,7 +1125,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): The encoder updates the flattened feature map through multiple self-attention layers. - Small tweak for CONDITIONAL_DETR: + Small tweak for Conditional DETR: - position_embeddings are added to the forward pass. 
@@ -1136,7 +1141,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) - # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + # in the original Conditional DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default # Initialize weights and apply final processing self.post_init() @@ -1341,7 +1346,7 @@ def forward( reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) obj_center = reference_points[..., :2].transpose(0, 1) # get sine embedding for the query vector - query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + query_sine_embed_before_transformation = gen_sine_position_embeddings(obj_center) for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1503,8 +1508,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") + >>> model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state @@ -1606,7 +1611,7 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): def __init__(self, config: ConditionalDetrConfig): super().__init__(config) - # CONDITIONAL_DETR encoder-decoder model + # CONDITIONAL DETR encoder-decoder model self.model = ConditionalDetrModel(config) # Object detection heads @@ -1662,8 +1667,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") + >>> model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) @@ -2374,7 +2379,7 @@ def forward(self, outputs, targets): return losses -# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDetr class ConditionalDetrMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, @@ -2396,7 +2401,6 @@ def forward(self, x): return x -# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py class ConditionalDetrHungarianMatcher(nn.Module): """ This class computes an assignment between the targets and the predictions of the network. 
@@ -2421,7 +2425,7 @@ def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float self.class_cost = class_cost self.bbox_cost = bbox_cost self.giou_cost = giou_cost - if class_cost == 0 or bbox_cost == 0 or giou_cost == 0: + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: raise ValueError("All costs of the Matcher can't be 0") @torch.no_grad() @@ -2488,6 +2492,7 @@ def _upcast(t: Tensor) -> Tensor: return t if t.dtype in (torch.int32, torch.int64) else t.int() +# Copied from transformers.models.detr.modeling_detr.box_area def box_area(boxes: Tensor) -> Tensor: """ Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. @@ -2504,7 +2509,7 @@ def box_area(boxes: Tensor) -> Tensor: return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) -# modified from torchvision to also return the union +# Copied from transformers.models.detr.modeling_detr.box_iou def box_iou(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) @@ -2521,6 +2526,7 @@ def box_iou(boxes1, boxes2): return iou, union +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou def generalized_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. @@ -2554,7 +2560,7 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes - +# Copied from transformers.models.detr.modeling_detr.NestedTensor class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors @@ -2575,7 +2581,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) - +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: max_size = _max_by_axis([list(img.shape) for img in tensor_list]) diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index 2d21cafc68520..371d97de0000b 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -253,7 +253,7 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} # encode them - feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") # verify pixel values @@ -299,7 +299,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") # encode them - # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic + # TODO replace by .from_pretrained microsoft/conditional-detr-resnet-50-panoptic feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic") encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index fd008b9af494f..2fb4bb562d5bf 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ 
b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -456,13 +456,13 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") if is_vision_available() else None ) def test_inference_no_head(self): - model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -479,7 +479,7 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +505,7 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDetrForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() From b46d9651930bf6cd0ddfa9870c11dd694bbd82cf Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:43:10 -0400 Subject: [PATCH 137/233] added some copied from --- .../feature_extraction_conditional_detr.py | 3 ++- .../models/conditional_detr/modeling_conditional_detr.py | 2 ++ .../conditional_detr/test_modeling_conditional_detr.py | 8 ++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index c9f5d8124e539..922cfaf09e98e 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -102,6 +102,7 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + # Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): @@ -399,7 +400,7 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + def __call__( self, images: ImageInput, diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 435a80b220b3b..a24a306b65e9f 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2560,6 +2560,7 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes + # Copied from transformers.models.detr.modeling_detr.NestedTensor class 
NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): @@ -2581,6 +2582,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) + # Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 2fb4bb562d5bf..a67f0bc30638e 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -479,7 +479,9 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to( + torch_device + ) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +507,9 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to( + torch_device + ) feature_extractor = self.default_feature_extractor image = prepare_img() From b944f0f34d57d3f3d329c48c9d20d582bc033bee Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 14:52:49 -0400 Subject: [PATCH 138/233] fixed use_pretrained issue --- .../models/conditional_detr/configuration_conditional_detr.py | 4 ++++ .../models/conditional_detr/modeling_conditional_detr.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 6abcf85894735..df6673e9ebfef 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -134,6 +134,7 @@ class ConditionalDetrConfig(PretrainedConfig): def __init__( self, + num_channels=3, num_queries=300, max_position_embeddings=1024, encoder_layers=6, @@ -157,6 +158,7 @@ def __init__( auxiliary_loss=False, position_embedding_type="sine", backbone="resnet50", + use_pretrained_backbone=True, dilation=False, class_cost=2, bbox_cost=5, @@ -169,6 +171,7 @@ def __init__( focal_alpha=0.25, **kwargs ): + self.num_channels = num_channels self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -191,6 +194,7 @@ def __init__( self.auxiliary_loss = auxiliary_loss self.position_embedding_type = position_embedding_type self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.dilation = dilation # Hungarian matcher self.class_cost = class_cost diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a24a306b65e9f..9e70e1b873cd6 100644 --- 
a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1452,7 +1452,9 @@ def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # Create backbone + positional encoding - backbone = ConditionalDetrTimmConvEncoder(config.backbone, config.dilation) + backbone = ConditionalDetrTimmConvEncoder( + config.backbone, config.dilation, config.use_pretrained_backbone, config.num_channels + ) position_embeddings = build_position_encoding(config) self.backbone = ConditionalDetrConvModel(backbone, position_embeddings) From 97d773321eac6fb32bd4da4be33b70939941681d Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 16 Sep 2022 10:35:52 -0400 Subject: [PATCH 139/233] changed post-process --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 922cfaf09e98e..be85334d7b354 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -701,7 +701,7 @@ def post_process(self, outputs, target_sizes): raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1) scores = topk_values topk_boxes = topk_indexes // out_logits.shape[2] labels = topk_indexes % out_logits.shape[2] From 72e8e7b7baa592e9a26c3093f2a9f8fe1a86855b Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:47:29 -0400 Subject: [PATCH 140/233] added conditional_detr files --- src/transformers/__init__.py | 7 +++++++ src/transformers/models/auto/feature_extraction_auto.py | 2 ++ src/transformers/models/auto/modeling_auto.py | 1 + .../models/conditional_detr/modeling_conditional_detr.py | 2 -- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6abc53c85008e..639d33c4754be 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3553,6 +3553,13 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) + from .models.conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ConditionalDETRModel, + ConditionalDETRPreTrainedModel, + ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, DetrForObjectDetection, diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 015fd132ef0dc..9ba8295b406da 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,8 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py 
index 8ee36125de25d..ca4d2d8876f01 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -459,6 +459,7 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), + ("conditional_detr", "ConditionalDETRForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 9e70e1b873cd6..ba958b6af9040 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -502,7 +502,6 @@ def gen_sine_position_embeddings(pos_tensor): pos = torch.cat((pos_y, pos_x), dim=2) return pos - def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) @@ -679,7 +678,6 @@ def __init__( def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() From ae286ae1fdac2ed353ad7e621c664e0428f13a4f Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:54:36 -0400 Subject: [PATCH 141/233] checked copies --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index ba958b6af9040..dd501e8fe4a1c 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1695,7 +1695,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] From 0b835dcab8962ec57547cf45313ac549d8447099 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 01:22:01 -0400 Subject: [PATCH 142/233] fixed style and copies --- docs/source/en/_toctree.yml | 6 +++--- .../models/conditional_detr/modeling_conditional_detr.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a4cd1005e3e83..d158be23cea58 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,7 +42,8 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - sections: + - isExpanded: false + sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +59,6 @@ - local: tasks/multiple_choice title: Multiple choice title: Task guides - isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification @@ -503,4 +503,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API \ No newline at end of file + title: API diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py 
b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index dd501e8fe4a1c..9e70e1b873cd6 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -502,6 +502,7 @@ def gen_sine_position_embeddings(pos_tensor): pos = torch.cat((pos_y, pos_x), dim=2) return pos + def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) @@ -678,6 +679,7 @@ def __init__( def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() @@ -1695,7 +1697,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] From f69d13f19a9145486298511c934d16bffb246c67 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 10:48:44 -0400 Subject: [PATCH 143/233] fixed some doc issue --- docs/source/en/_toctree.yml | 6 +++--- src/transformers/models/auto/feature_extraction_auto.py | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d158be23cea58..dfbdd503e6005 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,8 +42,7 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - isExpanded: false - sections: + - sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +57,8 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides + isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 9ba8295b406da..015fd132ef0dc 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,8 +39,6 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), From 72d788d0b15aadff7395048b87680549d5b04c6b Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:19:56 -0400 Subject: [PATCH 144/233] changed prefix to ConditionalDetr --- README.md | 1 + docs/source/en/_toctree.yml | 4 ++-- docs/source/en/index.mdx | 1 + src/transformers/__init__.py | 8 ++++---- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ec8a0fd2e8b39..1ba4614902013 100644 --- a/README.md +++ b/README.md @@ -279,6 +279,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index dfbdd503e6005..a4cd1005e3e83 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -57,7 +57,7 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides isExpanded: false title: Natural Language Processing - sections: @@ -503,4 +503,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API + title: API \ No newline at end of file diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 40685a5c2fa8d..a705cb447248c 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,6 +69,7 @@ The documentation is organized into five sections: 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 639d33c4754be..d2efd10af9141 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3555,10 +3555,10 @@ ) from .models.conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, - ConditionalDETRModel, - ConditionalDETRPreTrainedModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ConditionalDetrPreTrainedModel, ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, From b0eae03056a727c8502a77585ceaa90dd2b1f4e4 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:23:00 -0400 Subject: [PATCH 145/233] fixed docs --- README.md | 1 - docs/source/en/index.mdx | 1 - 2 files changed, 2 deletions(-) diff --git a/README.md b/README.md index 1ba4614902013..ec8a0fd2e8b39 100644 --- a/README.md +++ b/README.md @@ -279,7 +279,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. 
**[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index a705cb447248c..40685a5c2fa8d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,7 +69,6 @@ The documentation is organized into five sections: 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From 4d1c90f095123ed76ab6627f9c54fb4fcd88cf1e Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:20:11 -0400 Subject: [PATCH 146/233] added spatial_model_name --- utils/check_copies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check_copies.py b/utils/check_copies.py index 7d57173654468..0c8afa7b77ae8 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,6 +477,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", + "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. This only concerns modular models or those who do From 55214b78319d5ce1c879d525c5c1e4ddc4f98d71 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:24:51 -0400 Subject: [PATCH 147/233] fixed fix-copies --- utils/check_copies.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/check_copies.py b/utils/check_copies.py index 0c8afa7b77ae8..7d57173654468 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,7 +477,6 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", - "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. This only concerns modular models or those who do From 07612c43b8262e2f84e32aeb72aa9e4d1174f42d Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:54:05 -0400 Subject: [PATCH 148/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 9e70e1b873cd6..747f514737268 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -510,6 +510,7 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) +# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. 
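The checkpoint rename to `microsoft/conditional-detr-resnet-50` and the sigmoid-based `post_process` (top 300 queries) introduced in the patches above combine into the following inference flow. This is a minimal sketch, assuming the API spelled out in the updated docstrings (`ConditionalDetrFeatureExtractor`, `ConditionalDetrForObjectDetection`, and `post_process` returning per-image `scores`/`labels`/`boxes` dictionaries as in the DETR feature extractor it mirrors); the 0.5 score threshold is illustrative and not taken from these patches.

```python
import requests
import torch
from PIL import Image

from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForObjectDetection

# Renamed checkpoint used throughout the patches above.
feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50")
model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50")

# Same COCO test image as in the docstring examples.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post_process applies a sigmoid to the logits, keeps the top 300 query predictions
# (see the change from 100 to 300 above), and rescales the normalized boxes to the
# (height, width) sizes passed in target_sizes.
target_sizes = torch.tensor([image.size[::-1]])
results = feature_extractor.post_process(outputs, target_sizes)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    if score > 0.5:  # illustrative confidence threshold, not part of the patches
        print(model.config.id2label[label.item()], round(score.item(), 3), [round(c, 2) for c in box.tolist()])
```
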
From b31aec9aeda4e485b3737fba4dd68b419331e948 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:26:13 -0400 Subject: [PATCH 149/233] added some copied from --- .../feature_extraction_conditional_detr.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index be85334d7b354..4405a8d81b4b4 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -102,7 +102,6 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) - # Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): @@ -309,6 +308,7 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -379,6 +379,7 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. @@ -400,7 +401,8 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -623,6 +625,7 @@ def __call__( return encoded_inputs + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] From ef55c0e89478e2ff529abffe370af68fc149abb7 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:27:39 -0400 Subject: [PATCH 150/233] added some copied from --- .../conditional_detr/feature_extraction_conditional_detr.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 4405a8d81b4b4..9d09148c6b1fd 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -308,7 +308,6 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. 
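A rough sketch of the shortest-edge resize rule this `_resize` docstring describes, assuming the usual DETR-style defaults of `size=800` and `max_size=1333`; this is illustrative, not the library code:

```python
from PIL import Image

def shortest_edge_size(width, height, size, max_size=None):
    # scale so the shorter side equals `size`, capping the longer side at `max_size`
    short, long = min(width, height), max(width, height)
    if max_size is not None and long / short * size > max_size:
        size = int(round(max_size * short / long))
    if width < height:
        return size, int(size * height / width)
    return int(size * width / height), size

image = Image.new("RGB", (640, 480))
new_size = shortest_edge_size(image.width, image.height, size=800, max_size=1333)
print(image.resize(new_size).size)  # (1066, 800)
```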
If size is an int, smaller @@ -379,7 +378,6 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. @@ -402,7 +400,6 @@ def _normalize(self, image, mean, std, target=None): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -625,7 +622,6 @@ def __call__( return encoded_inputs - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] From 967fd7e5deea688257f1170fa08a1525fd826a55 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:40:45 -0400 Subject: [PATCH 151/233] added some copied from --- .../models/conditional_detr/modeling_conditional_detr.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 747f514737268..9ea9490c9befc 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -510,7 +510,6 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) -# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. @@ -2563,7 +2562,6 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes - # Copied from transformers.models.detr.modeling_detr.NestedTensor class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): @@ -2585,7 +2583,6 @@ def decompose(self): def __repr__(self): return str(self.tensors) - # Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: From 923fe3beefbc7b0eddab0bb0ef3e74e84fc067ff Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:43:10 -0400 Subject: [PATCH 152/233] added some copied from --- .../conditional_detr/feature_extraction_conditional_detr.py | 3 ++- .../models/conditional_detr/modeling_conditional_detr.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 9d09148c6b1fd..be85334d7b354 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -102,6 +102,7 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + # Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): @@ -399,7 +400,7 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + def __call__( self, images: ImageInput, diff --git 
a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 9ea9490c9befc..9e70e1b873cd6 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2562,6 +2562,7 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes + # Copied from transformers.models.detr.modeling_detr.NestedTensor class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): @@ -2583,6 +2584,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) + # Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: From 1b15d79d96b489ca90e0d48087b59b1a1f81bd1f Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 19 Sep 2022 11:05:44 -0400 Subject: [PATCH 153/233] fix style quality and copies --- src/transformers/__init__.py | 6 +++ src/transformers/models/auto/modeling_auto.py | 1 + .../modeling_conditional_detr.py | 44 ++++++++++--------- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d2efd10af9141..344e842114ea7 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3560,6 +3560,12 @@ ConditionalDetrModel, ConditionalDetrPreTrainedModel, ) + from .models.deformable_detr import ( + DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + DeformableDetrForObjectDetection, + DeformableDetrModel, + DeformableDetrPreTrainedModel, + ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, DetrForObjectDetection, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index ca4d2d8876f01..2ae4542d58bf6 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -460,6 +460,7 @@ ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), ("conditional_detr", "ConditionalDETRForObjectDetection"), + ("deformable_detr", "DeformableDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 9e70e1b873cd6..a8d272f312cd2 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -280,7 +280,7 @@ class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ def __init__(self, n): - super(ConditionalDetrFrozenBatchNorm2d, self).__init__() + super().__init__() self.register_buffer("weight", torch.ones(n)) self.register_buffer("bias", torch.zeros(n)) self.register_buffer("running_mean", torch.zeros(n)) @@ -293,7 +293,7 @@ def _load_from_state_dict( if num_batches_tracked_key in state_dict: del state_dict[num_batches_tracked_key] - super(ConditionalDetrFrozenBatchNorm2d, self)._load_from_state_dict( + super()._load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) @@ -398,14 +398,14 @@ def forward(self, pixel_values, pixel_mask): # Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDetr -def _expand_mask(mask: torch.Tensor, dtype: 
torch.dtype, tgt_len: Optional[int] = None): +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len + batch_size, source_len = mask.size() + target_len = target_len if target_len is not None else source_len - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) inverted_mask = 1.0 - expanded_mask @@ -462,12 +462,12 @@ def __init__(self, embedding_dim=256): self.column_embeddings = nn.Embedding(50, embedding_dim) def forward(self, pixel_values, pixel_mask=None): - h, w = pixel_values.shape[-2:] - i = torch.arange(w, device=pixel_values.device) - j = torch.arange(h, device=pixel_values.device) - x_emb = self.column_embeddings(i) - y_emb = self.row_embeddings(j) - pos = torch.cat([x_emb.unsqueeze(0).repeat(h, 1, 1), y_emb.unsqueeze(1).repeat(1, w, 1)], dim=-1) + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) pos = pos.permute(2, 0, 1) pos = pos.unsqueeze(0) pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) @@ -2538,15 +2538,17 @@ def generalized_box_iou(boxes1, boxes2): """ # degenerate boxes gives inf / nan results # so do an early check - assert (boxes1[:, 2:] >= boxes1[:, :2]).all() - assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") iou, union = box_iou(boxes1, boxes2) - lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - wh = (rb - lt).clamp(min=0) # [N,M,2] - area = wh[:, :, 0] * wh[:, :, 1] + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] return iou - (area - union) / area @@ -2590,11 +2592,11 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: max_size = _max_by_axis([list(img.shape) for img in tensor_list]) batch_shape = [len(tensor_list)] + max_size - b, c, h, w = batch_shape + batch_size, num_channels, height, width = batch_shape dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) for img, pad_img, m in zip(tensor_list, tensor, mask): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) m[: img.shape[1], : img.shape[2]] = False From 92fc3df930d7e09bf5746f444f655b4896361ba7 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 19 Sep 2022 
14:15:53 -0400 Subject: [PATCH 154/233] fix style quality and copies --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a8d272f312cd2..91c1a71d6aa82 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1330,7 +1330,7 @@ def forward( # expand encoder attention mask if encoder_hidden_states is not None and encoder_attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, target_len=input_shape[-1]) # optional intermediate hidden states intermediate = () if self.config.auxiliary_loss else None From 6696ec86a761bc8da1bf15c46e363e793a2644b5 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 19 Sep 2022 14:22:24 -0400 Subject: [PATCH 155/233] fix style quality and copies --- .../models/conditional_detr/modeling_conditional_detr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 91c1a71d6aa82..20aa578737d70 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1330,7 +1330,9 @@ def forward( # expand encoder attention mask if encoder_hidden_states is not None and encoder_attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, target_len=input_shape[-1]) + encoder_attention_mask = _expand_mask( + encoder_attention_mask, inputs_embeds.dtype, target_len=input_shape[-1] + ) # optional intermediate hidden states intermediate = () if self.config.auxiliary_loss else None From a3c45b382793577ed2ef1157a965bb5bfad71558 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 10:18:01 -0400 Subject: [PATCH 156/233] add more fix-copies --- .../conditional_detr/feature_extraction_conditional_detr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index be85334d7b354..dedaa5ccda3df 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -211,9 +211,10 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->ConditionalDETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by Conditional DETR. + Convert the target in COCO format into the format expected by ConditionalDETR. 
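A hedged sketch of the conversion this docstring refers to: COCO detection annotations (`bbox` as `[x, y, w, h]` in absolute pixels plus a `category_id`) become a per-image dict of tensors with normalized `(center_x, center_y, w, h)` boxes. The keys mirror the ones asserted in this PR's feature extraction tests; the exact library behaviour may differ in details:

```python
import torch

image_width, image_height = 640, 480
coco_annotations = [
    {"bbox": [10.0, 20.0, 200.0, 100.0], "category_id": 17, "area": 20000.0, "iscrowd": 0},
    {"bbox": [300.0, 50.0, 50.0, 80.0], "category_id": 75, "area": 4000.0, "iscrowd": 0},
]

boxes = torch.tensor([ann["bbox"] for ann in coco_annotations], dtype=torch.float32)
boxes[:, 0] += boxes[:, 2] / 2  # x_min -> center_x
boxes[:, 1] += boxes[:, 3] / 2  # y_min -> center_y
boxes /= torch.tensor([image_width, image_height, image_width, image_height])

target = {
    "boxes": boxes,
    "class_labels": torch.tensor([ann["category_id"] for ann in coco_annotations]),
    "area": torch.tensor([ann["area"] for ann in coco_annotations]),
    "iscrowd": torch.tensor([ann["iscrowd"] for ann in coco_annotations]),
    "orig_size": torch.tensor([image_height, image_width]),
}
print(target["boxes"])
```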
""" w, h = image.size From 37e2b259bf3c5471be8e6774ab4b917a3aca1a6d Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 11:36:53 -0400 Subject: [PATCH 157/233] fixed some variable names & added more fix-copies --- src/transformers/models/auto/modeling_auto.py | 1 - .../feature_extraction_conditional_detr.py | 10 +++++- .../modeling_conditional_detr.py | 26 +++++++------- ...est_feature_extraction_conditional_detr.py | 36 +++++++++---------- .../test_modeling_conditional_detr.py | 2 +- 5 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 2ae4542d58bf6..9c1136744e665 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -374,7 +374,6 @@ [ # Do not add new models here, this class will be deprecated in the future. # Model for Image Segmentation mapping - ("conditional_detr", "ConditionalDetrForSegmentation"), ("detr", "DetrForSegmentation"), ] ) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index dedaa5ccda3df..f625a034d294b 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -152,6 +152,8 @@ class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtrac model_input_names = ["pixel_values", "pixel_mask"] + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__init__ def __init__( self, format="coco_detection", @@ -172,11 +174,13 @@ def __init__( self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._is_valid_format def _is_valid_format(self, format): if format not in ["coco_detection", "coco_panoptic"]: raise ValueError(f"Format {format} not supported") return format + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): if self.format == "coco_detection": image, target = self.prepare_coco_detection(image, target, return_segmentation_masks) @@ -310,6 +314,7 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,6 +385,7 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. 
@@ -402,6 +408,7 @@ def _normalize(self, image, mean, std, target=None): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__call__ with Detr->ConditionalDetr,DETR->ConditionalDETR def __call__( self, images: ImageInput, @@ -549,7 +556,7 @@ def __call__( if annotations is not None: annotations = [annotations] - # prepare (COCO annotations as a list of Dict -> CONDITIONAL DETR target as a single Dict per image) + # prepare (COCO annotations as a list of Dict -> ConditionalDETR target as a single Dict per image) if annotations is not None: for idx, (image, target) in enumerate(zip(images, annotations)): if not isinstance(image, Image.Image): @@ -624,6 +631,7 @@ def __call__( return encoded_inputs + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 20aa578737d70..188e1ab8f5dde 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -911,8 +911,7 @@ def forward( k_pos = self.sa_kpos_proj(query_position_embeddings) v = self.sa_v_proj(hidden_states) - bs, num_queries, n_model = q_content.shape - _, hw, _ = k_content.shape + _, num_queries, n_model = q_content.shape q = q_content + q_pos k = k_content + k_pos @@ -936,8 +935,8 @@ def forward( k_content = self.ca_kcontent_proj(encoder_hidden_states) v = self.ca_v_proj(encoder_hidden_states) - bs, num_queries, n_model = q_content.shape - _, hw, _ = k_content.shape + batch_size, num_queries, n_model = q_content.shape + _, src_len, _ = k_content.shape k_pos = self.ca_kpos_proj(position_embeddings) @@ -951,13 +950,13 @@ def forward( q = q_content k = k_content - q = q.view(bs, num_queries, self.nhead, n_model // self.nhead) + q = q.view(batch_size, num_queries, self.nhead, n_model // self.nhead) query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) - query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model // self.nhead) - q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) - k = k.view(bs, hw, self.nhead, n_model // self.nhead) - k_pos = k_pos.view(bs, hw, self.nhead, n_model // self.nhead) - k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) + query_sine_embed = query_sine_embed.view(batch_size, num_queries, self.nhead, n_model // self.nhead) + q = torch.cat([q, query_sine_embed], dim=3).view(batch_size, num_queries, n_model * 2) + k = k.view(batch_size, src_len, self.nhead, n_model // self.nhead) + k_pos = k_pos.view(batch_size, src_len, self.nhead, n_model // self.nhead) + k = torch.cat([k, k_pos], dim=3).view(batch_size, src_len, n_model * 2) # Cross-Attention Block cross_attn_weights = None @@ -1118,6 +1117,7 @@ def _set_gradient_checkpointing(self, module, value=False): """ +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with Detr->ConditionalDetr,DETR->ConditionalDETR class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -1125,7 +1125,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): The encoder updates the flattened feature map through multiple self-attention layers. 
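The cross-attention hunks above show the core Conditional DETR trick: per head, the content query is concatenated with a sinusoidal positional query (`query_sine_embed`), and the key is concatenated with its positional embedding, so queries and keys live in a `2 * d_model` space while values keep `d_model`. An illustrative-only reshaping sketch with made-up sizes:

```python
import torch

batch_size, num_queries, source_len, n_model, nhead = 2, 300, 950, 256, 8

q_content = torch.randn(batch_size, num_queries, n_model)
query_sine_embed = torch.randn(batch_size, num_queries, n_model)
k_content = torch.randn(batch_size, source_len, n_model)
k_pos = torch.randn(batch_size, source_len, n_model)

q = q_content.view(batch_size, num_queries, nhead, n_model // nhead)
sine = query_sine_embed.view(batch_size, num_queries, nhead, n_model // nhead)
q = torch.cat([q, sine], dim=3).view(batch_size, num_queries, n_model * 2)

k = k_content.view(batch_size, source_len, nhead, n_model // nhead)
pos = k_pos.view(batch_size, source_len, nhead, n_model // nhead)
k = torch.cat([k, pos], dim=3).view(batch_size, source_len, n_model * 2)

print(q.shape, k.shape)  # torch.Size([2, 300, 512]) torch.Size([2, 950, 512])
```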
- Small tweak for Conditional DETR: + Small tweak for ConditionalDETR: - position_embeddings are added to the forward pass. @@ -1141,7 +1141,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) - # in the original Conditional DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + # in the original ConditionalDETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default # Initialize weights and apply final processing self.post_init() @@ -1191,7 +1191,7 @@ def forward( # expand attention_mask if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) encoder_states = () if output_hidden_states else None diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index 371d97de0000b..1e99c513f3524 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -240,8 +240,8 @@ def test_equivalence_pad_and_create_pixel_mask(self): encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") - assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + self.assertTrue(torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)) + self.assertTrue(torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4)) @slow def test_call_pytorch_with_coco_detection_annotations(self): @@ -261,31 +261,31 @@ def test_call_pytorch_with_coco_detection_annotations(self): self.assertEqual(encoding["pixel_values"].shape, expected_shape) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) # verify area expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) - assert torch.allclose(encoding["labels"][0]["area"], expected_area) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) # verify boxes expected_boxes_shape = torch.Size([6, 4]) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) # verify image_id expected_image_id = torch.tensor([39769]) - assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) # verify is_crowd expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - assert torch.allclose(encoding["labels"][0]["iscrowd"], 
expected_is_crowd) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) # verify class_labels expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) - assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) # verify orig_size expected_orig_size = torch.tensor([480, 640]) - assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) # verify size expected_size = torch.tensor([800, 1066]) - assert torch.allclose(encoding["labels"][0]["size"], expected_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) @slow def test_call_pytorch_with_coco_panoptic_annotations(self): @@ -308,31 +308,31 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): self.assertEqual(encoding["pixel_values"].shape, expected_shape) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) # verify area expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) - assert torch.allclose(encoding["labels"][0]["area"], expected_area) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) # verify boxes expected_boxes_shape = torch.Size([6, 4]) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) # verify image_id expected_image_id = torch.tensor([39769]) - assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) # verify is_crowd expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) # verify class_labels expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) - assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) # verify masks expected_masks_sum = 822338 self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) # verify orig_size expected_orig_size = torch.tensor([480, 640]) - assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) # verify size expected_size = torch.tensor([800, 1066]) - assert torch.allclose(encoding["labels"][0]["size"], expected_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index a67f0bc30638e..2c7be4d8f2b56 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ 
b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -472,7 +472,7 @@ def test_inference_no_head(self): outputs = model(**encoding) expected_shape = torch.Size((1, 300, 256)) - assert outputs.last_hidden_state.shape == expected_shape + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] ).to(torch_device) From 99b7660d2393a8cc8bb907b4b1993eaef6e2a68c Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 11:41:00 -0400 Subject: [PATCH 158/233] fixed some variable names & added more fix-copies --- .../feature_extraction_conditional_detr.py | 1 - .../test_feature_extraction_conditional_detr.py | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f625a034d294b..74cee04a72c7f 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -152,7 +152,6 @@ class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtrac model_input_names = ["pixel_values", "pixel_mask"] - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__init__ def __init__( self, diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index 1e99c513f3524..ddf7e66ef72fe 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -240,8 +240,12 @@ def test_equivalence_pad_and_create_pixel_mask(self): encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") - self.assertTrue(torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)) - self.assertTrue(torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4)) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + ) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + ) @slow def test_call_pytorch_with_coco_detection_annotations(self): From c5063f1048aca9419e0fa768ea6f1c3f6e2954e5 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 20 Sep 2022 10:36:47 -0400 Subject: [PATCH 159/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 74cee04a72c7f..96b9fa69db04e 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -122,7 +122,7 @@ def id_to_rgb(id_map): class 
ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" - Constructs a CONDITIONAL DETR feature extractor. + Constructs a Conditional DETR feature extractor. This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. From 745dfaa0c712f6db79b9eb658d34b4054d944226 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 20 Sep 2022 14:14:02 -0400 Subject: [PATCH 160/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index df6673e9ebfef..2890fd2be39d7 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,8 +27,8 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "microsoft/conditional_detr_resnet50": ( - "https://huggingface.co/microsoft/conditional_detr_resnet50/resolve/main/config.json" + "microsoft/conditional-detr-resnet-50": ( + "https://huggingface.co/microsoft/conditional_detr_resnet-50/resolve/main/config.json" ), } From 89e167f4c832cbe2cc9073956e23b020eff301b9 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 14:18:53 -0400 Subject: [PATCH 161/233] added more copied from --- .../conditional_detr/modeling_conditional_detr.py | 11 +++++++++-- utils/check_repo.py | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 188e1ab8f5dde..92044c4d30e02 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1013,8 +1013,15 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with DetrMLPPredictionHead->MLP class MLP(nn.Module): - """Very simple multi-layer perceptron (also called FFN)""" + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. 
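A small stand-alone sketch of the prediction head this `MLP` docstring describes: a few `Linear` layers with ReLU in between, mapping decoder query embeddings to four box values. The sizes and the final `sigmoid` are illustrative assumptions, not the exact library wiring:

```python
import torch
from torch import nn

class BoxMLP(nn.Module):
    def __init__(self, input_dim=256, hidden_dim=256, output_dim=4, num_layers=3):
        super().__init__()
        dims = [input_dim] + [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList([nn.Linear(i, o) for i, o in zip(dims, dims[1:] + [output_dim])])

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = nn.functional.relu(layer(x)) if i < len(self.layers) - 1 else layer(x)
        return x.sigmoid()  # keep (center_x, center_y, w, h) in [0, 1] for this sketch

queries = torch.randn(2, 300, 256)  # (batch_size, num_queries, hidden_size)
print(BoxMLP()(queries).shape)      # torch.Size([2, 300, 4])
```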
+ + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() @@ -1024,7 +1031,7 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): def forward(self, x): for i, layer in enumerate(self.layers): - x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x diff --git a/utils/check_repo.py b/utils/check_repo.py index 8659f795602e2..988967e797d12 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -167,6 +167,7 @@ "FlaxCLIPVisionModel", "FlaxWav2Vec2ForCTC", "DetrForSegmentation", + "ConditionalDetrForSegmentation", "DPRReader", "FlaubertForQuestionAnswering", "FlavaImageCodebook", From ee5a80f79c95c9952540e8096057681ed30b96bc Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 14:25:39 -0400 Subject: [PATCH 162/233] fixed quality --- .../models/conditional_detr/modeling_conditional_detr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 92044c4d30e02..c18a4aaee24bd 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -22,7 +22,6 @@ import torch from torch import Tensor, nn -from torch.nn import functional as F from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput From 0005e5cb7f74e8f24ba17245edb538b0c2cd494d Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 14:31:16 -0400 Subject: [PATCH 163/233] changed pretrained config --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 2890fd2be39d7..afa6426bc3738 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -28,7 +28,7 @@ CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { "microsoft/conditional-detr-resnet-50": ( - "https://huggingface.co/microsoft/conditional_detr_resnet-50/resolve/main/config.json" + "https://huggingface.co/microsoft/conditional-detr-resnet-50/resolve/main/config.json" ), } From 9231db8de535dd49206891dd4eda56a2d3ff9f72 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Wed, 21 Sep 2022 10:25:57 -0400 Subject: [PATCH 164/233] added more copied-from and fixed the issue in feature_extraction_auto --- .../models/auto/feature_extraction_auto.py | 2 +- .../modeling_conditional_detr.py | 61 +++++++++++-------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 015fd132ef0dc..cb75f439c233d 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,7 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("conditional_detr", "ConditionalDetrFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", 
"Wav2Vec2FeatureExtractor"), @@ -46,7 +47,6 @@ ("deformable_detr", "DetrFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), - ("detr", "DetrFeatureExtractor"), ("donut", "DonutFeatureExtractor"), ("dpt", "DPTFeatureExtractor"), ("flava", "FlavaFeatureExtractor"), diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index c18a4aaee24bd..79199ce06e428 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -509,11 +509,12 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) +# Copied from transformers.models.detr.modeling_detr.DetrAttention class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the Conditional DETR paper). + Here, we add position embeddings to the queries and keys (as explained in the DETR paper). """ def __init__( @@ -541,8 +542,8 @@ def __init__( self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): return tensor if position_embeddings is None else tensor + position_embeddings @@ -561,7 +562,7 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + batch_size, target_len, embed_dim = hidden_states.size() # add position embeddings to the hidden states before projecting to queries and keys if position_embeddings is not None: @@ -578,35 +579,36 @@ def forward( # get key, value proj if is_cross_attention: # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states_original), -1, bsz) + key_states = self._shape(self.k_proj(key_value_states), -1, batch_size) + value_states = self._shape(self.v_proj(key_value_states_original), -1, batch_size) else: # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states_original), -1, bsz) + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) key_states = key_states.view(*proj_shape) value_states = value_states.view(*proj_shape) - src_len = key_states.size(1) + source_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): raise 
ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" f" {attn_weights.size()}" ) if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): + if attention_mask.size() != (batch_size, 1, target_len, source_len): raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) attn_weights = nn.functional.softmax(attn_weights, dim=-1) @@ -615,8 +617,8 @@ def forward( # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) else: attn_weights_reshaped = None @@ -624,15 +626,15 @@ def forward( attn_output = torch.bmm(attn_probs, value_states) - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" f" {attn_output.size()}" ) - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) attn_output = self.out_proj(attn_output) @@ -755,6 +757,7 @@ def forward( return attn_output, attn_weights_reshaped +# Copied from transformers.models.detr.modeling_detr.DetrEncoderLayer with DetrEncoderLayer->ConditionalDetrEncoderLayer,DetrConfig->ConditionalDetrConfig class ConditionalDetrEncoderLayer(nn.Module): def __init__(self, config: ConditionalDetrConfig): super().__init__() @@ -783,7 +786,8 @@ def forward( Args: hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. position_embeddings (`torch.FloatTensor`, *optional*): position embeddings, to be added to hidden_states. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under @@ -1520,9 +1524,18 @@ def forward( >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") >>> model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50") + + >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") + + >>> # forward pass >>> outputs = model(**inputs) + + >>> # the last hidden states are the final query embeddings of the Transformer decoder + >>> # these are of shape (batch_size, num_queries, hidden_size) >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From 8c9ab857643fd98d2d1cf8e465c70a7d84fcd051 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Wed, 21 Sep 2022 11:55:04 -0400 Subject: [PATCH 165/233] fixed style --- src/transformers/__init__.py | 13 ------------- src/transformers/models/auto/modeling_auto.py | 1 - 2 files changed, 14 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 344e842114ea7..6abc53c85008e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3540,19 +3540,6 @@ except OptionalDependencyNotAvailable: from .utils.dummy_timm_and_vision_objects import * else: - from .models.conditional_detr import ( - CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDetrForObjectDetection, - ConditionalDetrForSegmentation, - ConditionalDetrModel, - ConditionalDetrPreTrainedModel, - ) - from .models.deformable_detr import ( - DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - DeformableDetrForObjectDetection, - DeformableDetrModel, - DeformableDetrPreTrainedModel, - ) from .models.conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, ConditionalDetrForObjectDetection, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 9c1136744e665..7f6fc80802138 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -458,7 +458,6 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), - ("conditional_detr", "ConditionalDETRForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), From 9c80a814908416efbf1794422535c1d46357da1a Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:47:29 -0400 Subject: [PATCH 166/233] added conditional_detr files --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/conditional_detr.mdx | 53 + docs/source/en/serialization.mdx | 1 + src/transformers/__init__.py | 20 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/conditional_detr/__init__.py | 87 + .../configuration_conditional_detr.py | 242 ++ ..._original_pytorch_checkpoint_to_pytorch.py | 298 ++ .../feature_extraction_conditional_detr.py | 938 ++++++ .../modeling_conditional_detr.py | 2547 +++++++++++++++++ .../utils/dummy_vision_objects.py | 7 + tests/models/conditional_detr/__init__.py | 0 
...est_feature_extraction_conditional_detr.py | 338 +++ .../test_modeling_conditional_detr.py | 538 ++++ 21 files changed, 5084 insertions(+) create mode 100644 docs/source/en/model_doc/conditional_detr.mdx create mode 100644 src/transformers/models/conditional_detr/__init__.py create mode 100644 src/transformers/models/conditional_detr/configuration_conditional_detr.py create mode 100644 src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py create mode 100644 src/transformers/models/conditional_detr/modeling_conditional_detr.py create mode 100644 tests/models/conditional_detr/__init__.py create mode 100644 tests/models/conditional_detr/test_feature_extraction_conditional_detr.py create mode 100644 tests/models/conditional_detr/test_modeling_conditional_detr.py diff --git a/README.md b/README.md index 31e4c5af04567..a481458cde19e 100644 --- a/README.md +++ b/README.md @@ -278,6 +278,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_ko.md b/README_ko.md index a4ccc124acc2f..de1f2b3e13560 100644 --- a/README_ko.md +++ b/README_ko.md @@ -228,6 +228,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_zh-hans.md b/README_zh-hans.md index 34839f54a3a80..f0ffb27c4101f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -252,6 +252,7 @@ conda install -c huggingface transformers 1. 
**[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang发布。 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 9e6a4a0b0ecfe..03687087f7a22 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -264,6 +264,7 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. 
**[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index f118359bc57be..7830c57e6eabd 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,6 +68,7 @@ The documentation is organized into five sections: 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[conditional_detr](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1.
**[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. @@ -215,6 +216,7 @@ Flax), PyTorch, and/or TensorFlow. | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | +| conditional_detr | ❌ | ❌ | ✅ | ❌ | ❌ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx new file mode 100644 index 0000000000000..51316a24e43ee --- /dev/null +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -0,0 +1,53 @@ + + +# Conditional DETR + +## Overview + +The conditional DETR model was proposed in [ConditionalDETR](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. + +The abstract from the paper is the following: + +*The recently-developed DETR approach applies the transformer encoder and decoder architecture to object detection and achieves promising performance. In this paper, we handle the critical issue, slow training convergence, and present a conditional cross-attention mechanism for fast DETR training. Our approach is motivated by that the cross-attention in DETR relies highly on the content embeddings for localizing the four extremities and predicting the box, which increases the need for high-quality content embeddings and thus the training difficulty. Our approach, named conditional DETR, learns a conditional spatial query from the decoder embedding for decoder multi-head cross-attention. The benefit is that through the conditional spatial query, each cross-attention head is able to attend to a band containing a distinct region, e.g., one object extremity or a region inside the object box. This narrows down the spatial range for localizing the distinct regions for object classification and box regression, thus relaxing the dependence on the content embeddings and easing the training. Empirical results show that conditional DETR converges 6.7× faster for the backbones R50 and R101 and 10× faster for stronger backbones DC5-R50 and DC5-R101. Code is available at https://github.com/Atten4Vis/ConditionalDETR.* + + +This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The original code can be found [here](https://github.com/Atten4Vis/ConditionalDETR). 
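The snippet below is an editor-added inference sketch, not part of the original patch. It assumes the checkpoint id `DeppMeng/ConditionalDETR`, taken from `CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP` in this patch, hosts converted weights; the output attributes `logits` and `pred_boxes` are the same ones the conversion script below verifies against the original model.

```python
>>> from PIL import Image
>>> import requests

>>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection

>>> # the COCO image also used by the conversion script in this patch
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> # checkpoint id assumed from the config archive map; it may differ from the final released checkpoints
>>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("DeppMeng/ConditionalDETR")
>>> model = ConditionalDETRForObjectDetection.from_pretrained("DeppMeng/ConditionalDETR")

>>> # resize, normalize and pad the image, then run detection
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)

>>> # class logits and boxes predicted per object query, boxes following the DETR convention
>>> # of normalized (center_x, center_y, width, height) coordinates
>>> logits, pred_boxes = outputs.logits, outputs.pred_boxes
```

`ConditionalDETRFeatureExtractor.post_process` (documented below) can then be used to convert these raw outputs into per-image scores, labels and absolute boxes.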
+ + +## ConditionalDETRConfig + +[[autodoc]] ConditionalDETRConfig + +## ConditionalDETRFeatureExtractor + +[[autodoc]] ConditionalDETRFeatureExtractor + - __call__ + - pad_and_create_pixel_mask + - post_process + - post_process_segmentation + - post_process_panoptic + +## ConditionalDETRModel + +[[autodoc]] ConditionalDETRModel + - forward + +## ConditionalDETRForObjectDetection + +[[autodoc]] ConditionalDETRForObjectDetection + - forward + +## ConditionalDETRForSegmentation + +[[autodoc]] ConditionalDETRForSegmentation + - forward \ No newline at end of file diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 74f50c78513ce..f6304004b62ae 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -57,6 +57,7 @@ Ready-made configurations include the following architectures: - CamemBERT - CLIP - CodeGen +- conditional_detr - ConvBERT - ConvNeXT - Data2VecText diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3c3a3a5006416..b96724b992093 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -172,6 +172,7 @@ "CLIPVisionConfig", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], + "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDETRConfig"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.cpm": [], @@ -660,6 +661,7 @@ _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") + _import_structure["models.conditional_detr"].append("ConditionalDETRFeatureExtractor") _import_structure["models.donut"].append("DonutFeatureExtractor") _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaProcessor"]) @@ -708,6 +710,15 @@ "DetrPreTrainedModel", ] ) + _import_structure["models.conditional_detr"].extend( + [ + "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", + "ConditionalDETRForObjectDetection", + "ConditionalDETRForSegmentation", + "ConditionalDETRModel", + "ConditionalDETRPreTrainedModel", + ] + ) try: if not is_scatter_available(): @@ -3075,6 +3086,7 @@ CLIPVisionConfig, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer + from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDETRConfig from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer @@ -3498,6 +3510,7 @@ from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor + from .models.conditional_detr import ConditionalDETRFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor @@ -3527,6 +3540,13 @@ except OptionalDependencyNotAvailable: from .utils.dummy_timm_and_vision_objects import * else: + from .models.conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + 
ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ConditionalDETRModel, + ConditionalDETRPreTrainedModel, + ) from .models.deformable_detr import ( DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, DeformableDetrForObjectDetection, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index fbdbfd579cb9e..8a6622a9f35b6 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -38,6 +38,7 @@ canine, clip, codegen, + conditional_detr, convbert, convnext, cpm, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 1204e6608a768..b28bb0665bc3d 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -43,6 +43,7 @@ ("canine", "CanineConfig"), ("clip", "CLIPConfig"), ("codegen", "CodeGenConfig"), + ("conditional_detr", "ConditionalDETRConfig"), ("convbert", "ConvBertConfig"), ("convnext", "ConvNextConfig"), ("ctrl", "CTRLConfig"), @@ -175,6 +176,7 @@ ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("codegen", "CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("conditional_detr", "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convbert", "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convnext", "CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ctrl", "CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -300,6 +302,7 @@ ("canine", "CANINE"), ("clip", "CLIP"), ("codegen", "CodeGen"), + ("conditional_detr", "conditional_detr"), ("convbert", "ConvBERT"), ("convnext", "ConvNeXT"), ("cpm", "CPM"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 015fd132ef0dc..9ba8295b406da 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,8 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 7f4968d03cdf6..322a48f1cefeb 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -42,6 +42,7 @@ ("canine", "CanineModel"), ("clip", "CLIPModel"), ("codegen", "CodeGenModel"), + ("conditional_detr", "ConditionalDETRModel"), ("convbert", "ConvBertModel"), ("convnext", "ConvNextModel"), ("ctrl", "CTRLModel"), @@ -373,6 +374,7 @@ [ # Do not add new models here, this class will be deprecated in the future. 
# Model for Image Segmentation mapping + ("conditional_detr", "ConditionalDETRForSegmentation"), ("detr", "DetrForSegmentation"), ] ) @@ -455,6 +457,7 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Object Detection mapping + ("conditional_detr", "ConditionalDETRForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), diff --git a/src/transformers/models/conditional_detr/__init__.py b/src/transformers/models/conditional_detr/__init__.py new file mode 100644 index 0000000000000..1b72e7d7a3238 --- /dev/null +++ b/src/transformers/models/conditional_detr/__init__.py @@ -0,0 +1,87 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_timm_available, is_vision_available + + +_import_structure = { + "configuration_conditional_detr": [ + "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", + "ConditionalDETRConfig", + "ConditionalDETROnnxConfig", + ] +} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDETRFeatureExtractor"] + +try: + if not is_timm_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_conditional_detr"] = [ + "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", + "ConditionalDETRForObjectDetection", + "ConditionalDETRForSegmentation", + "ConditionalDETRModel", + "ConditionalDETRPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, + ConditionalDETRConfig, + ConditionalDETROnnxConfig, + ) + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_conditional_detr import ConditionalDETRFeatureExtractor + + try: + if not is_timm_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ConditionalDETRModel, + ConditionalDETRPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py new file mode 100644 index 0000000000000..ce004dc6fc15c 
--- /dev/null +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -0,0 +1,242 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" CONDITIONAL_DETR model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "DeppMeng/ConditionalDETR": "https://huggingface.co/DeppMeng/ConditionalDETR/resolve/main/config.json", +} + + +class ConditionalDETRConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate + a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR + [DeppMeng/ConditionalDETR](https://huggingface.co/DeppMeng/ConditionalDETR) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`ConditionalDETRModel`] can detect in a single image. For COCO, we recommend 300 queries. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in encoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities.
+ activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a + list of all available models, see [this + page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). + class_cost (`float`, *optional*, defaults to 2): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss.
+ + Examples: + + ```python + >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig + + >>> # Initializing a CONDITIONAL_DETR DeppMeng/ConditionalDETR style configuration + >>> configuration = ConditionalDETRConfig() + + >>> # Initializing a model from the DeppMeng/ConditionalDETR style configuration + >>> model = ConditionalDETRModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "conditional_detr" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + num_queries=300, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + classifier_dropout=0.0, + scale_embedding=False, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + dilation=False, + class_cost=2, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + cls_loss_coefficient=2, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + focal_alpha=0.25, + **kwargs + ): + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.dilation = dilation + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.cls_loss_coefficient = cls_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.focal_alpha = focal_alpha + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + +class ConditionalDETROnnxConfig(OnnxConfig): + + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + 
("pixel_mask", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-5 + + @property + def default_onnx_opset(self) -> int: + return 12 diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000..2d7abc3c088b5 --- /dev/null +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,298 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert CONDITIONAL_DETR checkpoints.""" + + +import argparse +import json +from collections import OrderedDict +from pathlib import Path + +import torch +from PIL import Image + +import requests +from huggingface_hub import hf_hub_download +from transformers import ( + ConditionalDETRConfig, + ConditionalDETRFeatureExtractor, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +# here we list all keys to be renamed (original name on the left, our name on the right) +rename_keys = [] +for i in range(6): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) + # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append( + ( + 
f"transformer.decoder.layers.{i}.cross_attn.out_proj.weight", + f"decoder.layers.{i}.encoder_attn.out_proj.weight", + ) + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.cross_attn.out_proj.bias", + f"decoder.layers.{i}.encoder_attn.out_proj.bias", + ) + ) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) + + # q, k, v projections in self/cross-attention in decoder for conditional DETR + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight")) + # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight")) + + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", 
f"decoder.layers.{i}.ca_qcontent_proj.bias")) + # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias")) + +# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads +# for conditional DETR, also convert reference point head and query scale MLP +rename_keys.extend( + [ + ("input_proj.weight", "input_projection.weight"), + ("input_proj.bias", "input_projection.bias"), + ("query_embed.weight", "query_position_embeddings.weight"), + ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), + ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), + ("class_embed.weight", "class_labels_classifier.weight"), + ("class_embed.bias", "class_labels_classifier.bias"), + ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), + ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), + ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), + ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), + ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), + ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), + ("transformer.decoder.ref_point_head.layers.0.weight", "decoder.ref_point_head.layers.0.weight"), + ("transformer.decoder.ref_point_head.layers.0.bias", "decoder.ref_point_head.layers.0.bias"), + ("transformer.decoder.ref_point_head.layers.1.weight", "decoder.ref_point_head.layers.1.weight"), + ("transformer.decoder.ref_point_head.layers.1.bias", "decoder.ref_point_head.layers.1.bias"), + ("transformer.decoder.query_scale.layers.0.weight", "decoder.query_scale.layers.0.weight"), + ("transformer.decoder.query_scale.layers.0.bias", "decoder.query_scale.layers.0.bias"), + ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), + ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), + ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), + ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias") + ] +) + + +def rename_key(state_dict, old, new): + val = state_dict.pop(old) + state_dict[new] = val + + +def rename_backbone_keys(state_dict): + new_state_dict = OrderedDict() + for key, value in state_dict.items(): + if "backbone.0.body" in key: + new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") + new_state_dict[new_key] = value + else: + new_state_dict[key] = value + + return new_state_dict + + +def read_in_q_k_v(state_dict, is_panoptic=False): + prefix = "" + if is_panoptic: + prefix = "conditional_detr." 
+ + # first: transformer encoder + for i in range(6): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our CONDITIONAL_DETR structure. + """ + + # load default config + config = ConditionalDETRConfig() + # set backbone and dilation attributes + if "resnet101" in model_name: + config.backbone = "resnet101" + if "dc5" in model_name: + config.dilation = True + is_panoptic = "panoptic" in model_name + if is_panoptic: + config.num_labels = 250 + else: + config.num_labels = 91 + repo_id = "datasets/huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load feature extractor + format = "coco_panoptic" if is_panoptic else "coco_detection" + feature_extractor = ConditionalDETRFeatureExtractor(format=format) + + # prepare image + img = prepare_img() + encoding = feature_extractor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info(f"Converting model {model_name}...") + + # load original model from torch hub + conditional_detr = torch.hub.load("DeppMeng/ConditionalDETR", model_name, pretrained=True).eval() + state_dict = conditional_detr.state_dict() + # rename keys + for src, dest in rename_keys: + if is_panoptic: + src = "conditional_detr." + src + rename_key(state_dict, src, dest) + state_dict = rename_backbone_keys(state_dict) + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, is_panoptic=is_panoptic) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "conditional_detr.model." if is_panoptic else "model." + for key in state_dict.copy().keys(): + if is_panoptic: + if ( + key.startswith("conditional_detr") + and not key.startswith("class_labels_classifier") + and not key.startswith("bbox_predictor") + ): + val = state_dict.pop(key) + state_dict["conditional_detr.model" + key[4:]] = val + elif "class_labels_classifier" in key or "bbox_predictor" in key: + val = state_dict.pop(key) + state_dict["conditional_detr." 
+ key] = val + elif key.startswith("bbox_attention") or key.startswith("mask_head"): + continue + else: + val = state_dict.pop(key) + state_dict[prefix + key] = val + else: + if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = ConditionalDETRForSegmentation(config) if is_panoptic else ConditionalDETRForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + # verify our conversion + original_outputs = conditional_detr(pixel_values) + outputs = model(pixel_values) + assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) + assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) + if is_panoptic: + assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) + + # Save model and feature extractor + logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name", + default="conditional_detr_resnet50", + type=str, + help="Name of the CONDITIONAL_DETR model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + args = parser.parse_args() + convert_conditional_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py new file mode 100644 index 0000000000000..07550124c157e --- /dev/null +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -0,0 +1,938 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Feature extractor class for CONDITIONAL_DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Dict, List, Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor +from ...utils import TensorType, is_torch_available, logging + + +if is_torch_available(): + import torch + from torch import nn + +logger = logging.get_logger(__name__) + + +ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] + + +# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +def center_to_corners_format(x): + """ + Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format + (x_0, y_0, x_1, y_1). + """ + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +def corners_to_center_format(x): + """ + Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1, + y_1) to center format (center_x, center_y, width, height). + """ + x_transposed = x.T + x0, y0, x1, y1 = x_transposed[0], x_transposed[1], x_transposed[2], x_transposed[3] + b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] + return np.stack(b, axis=-1) + + +def masks_to_boxes(masks): + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + + Returns a [N, 4] tensor, with the boxes in corner (xyxy) format. + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. 
+def rgb_to_id(color): + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def id_to_rgb(id_map): + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class ConditionalDETRFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a CONDITIONAL_DETR feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + + Args: + format (`str`, *optional*, defaults to `"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input to a certain `size`. + size (`int`, *optional*, defaults to 800): + Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a + sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of + the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * + height / width, size)`. + max_size (`int`, *optional*, defaults to `1333`): + The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is + set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`): + The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean. + image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the + ImageNet std. 
+ """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + format="coco_detection", + do_resize=True, + size=800, + max_size=1333, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.format = self._is_valid_format(format) + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean + self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std + + def _is_valid_format(self, format): + if format not in ["coco_detection", "coco_panoptic"]: + raise ValueError(f"Format {format} not supported") + return format + + def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): + if self.format == "coco_detection": + image, target = self.prepare_coco_detection(image, target, return_segmentation_masks) + return image, target + elif self.format == "coco_panoptic": + image, target = self.prepare_coco_panoptic(image, target, masks_path) + return image, target + else: + raise ValueError(f"Format {self.format} not supported") + + # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 + def convert_coco_poly_to_mask(self, segmentations, height, width): + + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 + def prepare_coco_detection(self, image, target, return_segmentation_masks=False): + """ + Convert the target in COCO format into the format expected by CONDITIONAL_DETR. 
+ """ + w, h = image.size + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # get all COCO annotations for the given image + anno = target["annotations"] + + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=w) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = np.asarray(classes, dtype=np.int64) + + if return_segmentation_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = self.convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.reshape((-1, 3)) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if return_segmentation_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["class_labels"] = classes + if return_segmentation_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in anno], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno], dtype=np.int64) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + + return image, target + + def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): + w, h = image.size + ann_info = target.copy() + ann_path = pathlib.Path(masks_path) / ann_info["file_name"] + + if "segments_info" in ann_info: + masks = np.asarray(Image.open(ann_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([ann["id"] for ann in ann_info["segments_info"]]) + masks = masks == ids[:, None, None] + masks = np.asarray(masks, dtype=np.uint8) + + labels = np.asarray([ann["category_id"] for ann in ann_info["segments_info"]], dtype=np.int64) + + target = {} + target["image_id"] = np.asarray( + [ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]], dtype=np.int64 + ) + if return_masks: + target["masks"] = masks + target["class_labels"] = labels + + target["boxes"] = masks_to_boxes(masks) + + target["size"] = np.asarray([int(h), int(w)], dtype=np.int64) + target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64) + if "segments_info" in ann_info: + target["iscrowd"] = np.asarray([ann["iscrowd"] for ann in ann_info["segments_info"]], dtype=np.int64) + target["area"] = np.asarray([ann["area"] for ann in ann_info["segments_info"]], dtype=np.float32) + + return image, target + + def _resize(self, image, size, target=None, max_size=None): + """ + Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller + edge of the image will be matched to this number. + + If given, also resize the target accordingly. 
+ """ + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size + else: + # size returned must be (w, h) since we use PIL to resize images + # so we revert the tuple + return get_size_with_aspect_ratio(image_size, size, max_size)[::-1] + + size = get_size(image.size, size, max_size) + rescaled_image = self.resize(image, size=size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + w, h = size + target["size"] = np.asarray([h, w], dtype=np.int64) + + if "masks" in target: + # use PyTorch as current workaround + # TODO replace by self.resize + masks = torch.from_numpy(target["masks"][:, None]).float() + interpolated_masks = nn.functional.interpolate(masks, size=(h, w), mode="nearest")[:, 0] > 0.5 + target["masks"] = interpolated_masks.numpy() + + return rescaled_image, target + + def _normalize(self, image, mean, std, target=None): + """ + Normalize the image with a certain mean and std. + + If given, also normalize the target bounding boxes based on the size of the image. + """ + + image = self.normalize(image, mean=mean, std=std) + if target is None: + return image, None + + target = target.copy() + h, w = image.shape[-2:] + + if "boxes" in target: + boxes = target["boxes"] + boxes = corners_to_center_format(boxes) + boxes = boxes / np.asarray([w, h, w, h], dtype=np.float32) + target["boxes"] = boxes + + return image, target + + def __call__( + self, + images: ImageInput, + annotations: Union[List[Dict], List[List[Dict]]] = None, + return_segmentation_masks: Optional[bool] = False, + masks_path: Optional[pathlib.Path] = None, + pad_and_return_pixel_mask: Optional[bool] = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s) and optional annotations. Images are by default + padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are + real/which are padding. + + + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. 
In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+
+            annotations (`Dict`, `List[Dict]`, *optional*):
+                The corresponding annotations in COCO format.
+
+                In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_detection"`, the
+                annotations for each image should have the following format: {'image_id': int, 'annotations':
+                [annotation]}, with the annotations being a list of COCO object annotations.
+
+                In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the
+                annotations for each image should have the following format: {'image_id': int, 'file_name': str,
+                'segments_info': [segment_info]} with segments_info being a list of COCO panoptic annotations.
+
+            return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+                Whether to also include instance segmentation masks as part of the labels in case `format =
+                "coco_detection"`.
+
+            masks_path (`pathlib.Path`, *optional*):
+                Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only
+                relevant in case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`.
+
+            pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether or not to pad images up to the largest image in a batch and create a pixel mask.
+
+                If left to the default, will return a pixel mask that is:
+
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor`
+                objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **pixel_values** -- Pixel values to be fed to a model.
+            - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if
+              *"pixel_mask"* is in `self.model_input_names`).
+            - **labels** -- Optional labels to be fed to a model (when `annotations` are provided).
+        """
+        # Input type checking for clearer error
+
+        valid_images = False
+        valid_annotations = False
+        valid_masks_path = False
+
+        # Check that images has a valid type
+        if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
+            valid_images = True
+        elif isinstance(images, (list, tuple)):
+            if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
+                valid_images = True
+
+        if not valid_images:
+            raise ValueError(
+                "Images must be of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
+                "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
+ ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + # Check that annotations has a valid type + if annotations is not None: + if not is_batched: + if self.format == "coco_detection": + if isinstance(annotations, dict) and "image_id" in annotations and "annotations" in annotations: + if isinstance(annotations["annotations"], (list, tuple)): + # an image can have no annotations + if len(annotations["annotations"]) == 0 or isinstance(annotations["annotations"][0], dict): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations, dict) and "image_id" in annotations and "segments_info" in annotations: + if isinstance(annotations["segments_info"], (list, tuple)): + # an image can have no segments (?) + if len(annotations["segments_info"]) == 0 or isinstance( + annotations["segments_info"][0], dict + ): + valid_annotations = True + else: + if isinstance(annotations, (list, tuple)): + if len(images) != len(annotations): + raise ValueError("There must be as many annotations as there are images") + if isinstance(annotations[0], Dict): + if self.format == "coco_detection": + if isinstance(annotations[0]["annotations"], (list, tuple)): + valid_annotations = True + elif self.format == "coco_panoptic": + if isinstance(annotations[0]["segments_info"], (list, tuple)): + valid_annotations = True + + if not valid_annotations: + raise ValueError( + """ + Annotations must of type `Dict` (single image) or `List[Dict]` (batch of images). In case of object + detection, each dictionary should contain the keys 'image_id' and 'annotations', with the latter + being a list of annotations in COCO format. In case of panoptic segmentation, each dictionary + should contain the keys 'file_name', 'image_id' and 'segments_info', with the latter being a list + of annotations in COCO format. + """ + ) + + # Check that masks_path has a valid type + if masks_path is not None: + if self.format == "coco_panoptic": + if isinstance(masks_path, pathlib.Path): + valid_masks_path = True + if not valid_masks_path: + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + " `pathlib.Path` object." 
+ ) + + if not is_batched: + images = [images] + if annotations is not None: + annotations = [annotations] + + # prepare (COCO annotations as a list of Dict -> CONDITIONAL_DETR target as a single Dict per image) + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + image, target = self.prepare(image, target, return_segmentation_masks, masks_path) + images[idx] = image + annotations[idx] = target + + # transformations (resizing + normalization) + if self.do_resize and self.size is not None: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._resize(image=image, target=target, size=self.size, max_size=self.max_size) + images[idx] = image + annotations[idx] = target + else: + for idx, image in enumerate(images): + images[idx] = self._resize(image=image, target=None, size=self.size, max_size=self.max_size)[0] + + if self.do_normalize: + if annotations is not None: + for idx, (image, target) in enumerate(zip(images, annotations)): + image, target = self._normalize( + image=image, mean=self.image_mean, std=self.image_std, target=target + ) + images[idx] = image + annotations[idx] = target + else: + images = [ + self._normalize(image=image, mean=self.image_mean, std=self.image_std)[0] for image in images + ] + + if pad_and_return_pixel_mask: + # pad images up to largest image in batch and create pixel_mask + max_size = self._max_by_axis([list(image.shape) for image in images]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in images: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + images = padded_images + + # return as BatchFeature + data = {} + data["pixel_values"] = images + if pad_and_return_pixel_mask: + data["pixel_mask"] = pixel_mask + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + # Convert to TensorType + tensor_type = return_tensors + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + if not tensor_type == TensorType.PYTORCH: + raise ValueError("Only PyTorch is supported for the moment.") + else: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + + encoded_inputs["labels"] = [ + {k: torch.from_numpy(v) for k, v in target.items()} for target in annotations + ] + + return encoded_inputs + + def _max_by_axis(self, the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + def pad_and_create_pixel_mask( + self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None + ): + """ + Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. + + Args: + pixel_values_list (`List[torch.Tensor]`): + List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. 
If set to `'pt'`, return PyTorch `torch.Tensor` + objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if + *"pixel_mask"* is in `self.model_input_names`). + + """ + + max_size = self._max_by_axis([list(image.shape) for image in pixel_values_list]) + c, h, w = max_size + padded_images = [] + pixel_mask = [] + for image in pixel_values_list: + # create padded image + padded_image = np.zeros((c, h, w), dtype=np.float32) + padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) + padded_images.append(padded_image) + # create pixel mask + mask = np.zeros((h, w), dtype=np.int64) + mask[: image.shape[1], : image.shape[2]] = True + pixel_mask.append(mask) + + # return as BatchFeature + data = {"pixel_values": padded_images, "pixel_mask": pixel_mask} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs + + # POSTPROCESSING METHODS + # inspired by https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py#L258 + def post_process(self, outputs, target_sizes): + """ + Converts the output of [`ConditionalDETRForObjectDetection`] into the format expected by the COCO api. Only + supports PyTorch. + + Args: + outputs ([`ConditionalDETRObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). For visualization, this should be the image size after data + augment, but before padding. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = topk_indexes // out_logits.shape[2] + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into image segmentation predictions. Only supports + PyTorch. + + Parameters: + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. 
+ threshold (`float`, *optional*, defaults to 0.9): + Threshold to use to filter out queries. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image + in the batch as predicted by the model. + """ + out_logits, raw_masks = outputs.logits, outputs.pred_masks + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1 + + predictions = {"scores": cur_scores, "labels": cur_classes, "masks": cur_masks} + preds.append(predictions) + return preds + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): + """ + Converts the output of [`ConditionalDETRForSegmentation`] into actual instance segmentation predictions. Only + supports PyTorch. + + Args: + results (`List[Dict]`): + Results list obtained by [`~ConditionalDETRFeatureExtractor.post_process`], to which "masks" results + will be added. + outputs ([`ConditionalDETRSegmentationOutput`]): + Raw outputs of the model. + orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). + max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). + threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an + image in the batch as predicted by the model. 
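+
+        Example (a minimal sketch: `feature_extractor` is an instance of this class, `outputs` are raw
+        [`ConditionalDETRForSegmentation`] outputs, and the target size tensors are assumed to be available):
+
+        ```python
+        >>> results = feature_extractor.post_process(outputs, orig_target_sizes)
+        >>> results = feature_extractor.post_process_instance(results, outputs, orig_target_sizes, max_target_sizes)
+        >>> sorted(results[0].keys())
+        ['boxes', 'labels', 'masks', 'scores']
+        ```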
+        """
+
+        if len(orig_target_sizes) != len(max_target_sizes):
+            raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes")
+        max_h, max_w = max_target_sizes.max(0)[0].tolist()
+        outputs_masks = outputs.pred_masks.squeeze(2)
+        outputs_masks = nn.functional.interpolate(
+            outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False
+        )
+        outputs_masks = (outputs_masks.sigmoid() > threshold).cpu()
+
+        for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
+            img_h, img_w = t[0], t[1]
+            results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
+            results[i]["masks"] = nn.functional.interpolate(
+                results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest"
+            ).byte()
+
+        return results
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
+    def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
+        """
+        Converts the output of [`ConditionalDETRForSegmentation`] into actual panoptic predictions. Only supports
+        PyTorch.
+
+        Parameters:
+            outputs ([`ConditionalDETRSegmentationOutput`]):
+                Raw outputs of the model.
+            processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
+                Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data
+                augmentation but before batching.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*):
+                Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to
+                None, it will default to the `processed_sizes`.
+            is_thing_map (`Dict[int, bool]`, *optional*):
+                Dictionary mapping class indices to either True or False, depending on whether or not they are a thing.
+                If not set, defaults to the `is_thing_map` of COCO panoptic.
+            threshold (`float`, *optional*, defaults to 0.85):
+                Threshold to use to filter out queries.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for
+            an image in the batch as predicted by the model.
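+
+        Example (a minimal sketch: `feature_extractor` is an instance of this class, `outputs` are raw
+        [`ConditionalDETRForSegmentation`] outputs, and the processed size below is illustrative):
+
+        ```python
+        >>> processed_sizes = [(800, 1066)]  # (height, width) of each image after resizing, before padding
+        >>> panoptic_predictions = feature_extractor.post_process_panoptic(outputs, processed_sizes)
+        >>> sorted(panoptic_predictions[0].keys())
+        ['png_string', 'segments_info']
+        ```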
+ """ + if target_sizes is None: + target_sizes = processed_sizes + if len(processed_sizes) != len(target_sizes): + raise ValueError("Make sure to pass in as many processed_sizes as target_sizes") + + if is_thing_map is None: + # default to is_thing_map of COCO panoptic + is_thing_map = {i: i <= 90 for i in range(201)} + + out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes + if not len(out_logits) == len(raw_masks) == len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" + ) + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = center_to_corners_format(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + if len(cur_boxes) != len(cur_classes): + raise ValueError("Not as many boxes as there are classes") + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_classes): + if not is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) + + np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) + np_seg_img = np_seg_img.view(final_h, final_w, 3) + np_seg_img = np_seg_img.numpy() + + m_id = torch.from_numpy(rgb_to_id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_classes.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + 
cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_classes[i].item() + segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) + del cur_classes + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py new file mode 100644 index 0000000000000..1f63d22be27a5 --- /dev/null +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -0,0 +1,2547 @@ +# coding=utf-8 +# Copyright 2022 Attn4Vis and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Conditional DETR model.""" + + +import math +import random +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +from torch import Tensor, nn +from torch.nn import functional as F + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import torch_int_div +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + is_vision_available, + logging, + replace_return_docstrings, + requires_backends, +) +from .configuration_conditional_detr import ConditionalDETRConfig + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_vision_available(): + from .feature_extraction_conditional_detr import center_to_corners_format + +if is_timm_available(): + from timm import create_model + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ConditionalDETRConfig" +_CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" + +CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Atten4Vis/ConditionalDETR", + # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr +] + + + +@dataclass +class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): + """ + Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + reference_points: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class ConditionalDETRModelOutput(Seq2SeqModelOutput): + """ + Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + reference_points: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDETR +class ConditionalDETRObjectDetectionOutput(ModelOutput): + """ + Output type of [`ConditionalDETRForObjectDetection`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding + boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. 
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDETR +class ConditionalDETRSegmentationOutput(ModelOutput): + """ + Output type of [`ConditionalDETRForSegmentation`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. 
+ loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding + boxes. + pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): + Segmentation masks logits for all queries. See also [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks + respectively. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. 
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + pred_masks: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +# BELOW: utilities copied from +# https://github.com/facebookresearch/detr/blob/master/backbone.py +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDETR +class ConditionalDETRFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super(ConditionalDETRFrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(ConditionalDETRFrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDETR +def replace_batch_norm(m, name=""): + for attr_str in dir(m): + target_attr = getattr(m, attr_str) + if isinstance(target_attr, nn.BatchNorm2d): + frozen = ConditionalDETRFrozenBatchNorm2d(target_attr.num_features) + bn = getattr(m, attr_str) + frozen.weight.data.copy_(bn.weight) + frozen.bias.data.copy_(bn.bias) + frozen.running_mean.data.copy_(bn.running_mean) + frozen.running_var.data.copy_(bn.running_var) + setattr(m, attr_str, frozen) + for n, ch in m.named_children(): + replace_batch_norm(ch, n) + + +# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder with Detr->ConditionalDETR +class ConditionalDETRTimmConvEncoder(nn.Module): + """ + Convolutional encoder 
(backbone) from the timm library. + + nn.BatchNorm2d layers are replaced by ConditionalDETRFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, name: str, dilation: bool): + super().__init__() + + kwargs = {} + if dilation: + kwargs["output_stride"] = 16 + + requires_backends(self, ["timm"]) + + backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs) + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = self.model.feature_info.channels() + + if "resnet" in name: + for name, parameter in self.model.named_parameters(): + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDETR +class ConditionalDETRConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + +# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDETR +class ConditionalDETRSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
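+
+    A shape-only sketch (illustrative sizes; the mask is all ones, i.e. no padding):
+
+    ```python
+    >>> embedding = ConditionalDETRSinePositionEmbedding(embedding_dim=128, normalize=True)
+    >>> pixel_values, pixel_mask = torch.zeros(2, 3, 32, 32), torch.ones(2, 32, 32, dtype=torch.long)
+    >>> embedding(pixel_values, pixel_mask).shape  # channels = 2 * embedding_dim
+    torch.Size([2, 256, 32, 32])
+    ```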
+ """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2) / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDETR +class ConditionalDETRLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + h, w = pixel_values.shape[-2:] + i = torch.arange(w, device=pixel_values.device) + j = torch.arange(h, device=pixel_values.device) + x_emb = self.column_embeddings(i) + y_emb = self.row_embeddings(j) + pos = torch.cat([x_emb.unsqueeze(0).repeat(h, 1, 1), y_emb.unsqueeze(1).repeat(1, w, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDETR +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = ConditionalDETRSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = ConditionalDETRLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + +# function to generate sine positional embedding for 2d coordinates +def gen_sineembed_for_position(pos_tensor): + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * (dim_t // 2) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + pos = torch.cat((pos_y, pos_x), dim=2) + return pos + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - 
x).clamp(min=eps) + return torch.log(x1/x2) + +class DetrAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the CONDITIONAL_DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + key_value_states: Optional[torch.Tensor] = None, + key_value_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # add key-value position embeddings to the key value states + if key_value_position_embeddings is not None: + key_value_states_original = key_value_states + key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states_original), -1, bsz) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states_original), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, 
tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + +class ConditionalDETRAttention(nn.Module): + """ + Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. + + The key q_proj, k_proj, v_proj are defined outside the attention. + This attention allows the dim of q, k to be different to v. + """ + + def __init__( + self, + embed_dim: int, + out_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.out_dim = out_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + ) + # head dimension of values + self.v_head_dim = out_dim // num_heads + if self.v_head_dim * num_heads != self.out_dim: + raise ValueError( + f"out_dim must be divisible by num_heads (got `out_dim`: {self.out_dim} and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + + self.out_proj = nn.Linear(out_dim, out_dim, bias=bias) + + def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + key_states: Optional[torch.Tensor] = None, + value_states: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = hidden_states * self.scaling + # get key, value proj + key_states = self._qk_shape(key_states, -1, bsz) + value_states = self._v_shape(value_states, -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + v_proj_shape = (bsz * self.num_heads, -1, self.v_head_dim) + query_states = self._qk_shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*v_proj_shape) + + src_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.v_head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.v_head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.out_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class ConditionalDETREncoderLayer(nn.Module): + def __init__(self, config: ConditionalDETRConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_embeddings (`torch.FloatTensor`, *optional*): position embeddings, to be added to hidden_states. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class ConditionalDETRDecoderLayer(nn.Module): + def __init__(self, config: ConditionalDETRConfig): + super().__init__() + self.embed_dim = config.d_model + + d_model = config.d_model + # Decoder Self-Attention projections + self.sa_qcontent_proj = nn.Linear(d_model, d_model) + self.sa_qpos_proj = nn.Linear(d_model, d_model) + self.sa_kcontent_proj = nn.Linear(d_model, d_model) + self.sa_kpos_proj = nn.Linear(d_model, d_model) + self.sa_v_proj = nn.Linear(d_model, d_model) + + self.self_attn = ConditionalDETRAttention( + embed_dim=self.embed_dim, + out_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + + # Decoder Cross-Attention projections + self.ca_qcontent_proj = nn.Linear(d_model, d_model) + self.ca_qpos_proj = nn.Linear(d_model, d_model) + self.ca_kcontent_proj = nn.Linear(d_model, d_model) + self.ca_kpos_proj = nn.Linear(d_model, d_model) + self.ca_v_proj = nn.Linear(d_model, d_model) + self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) + + self.encoder_attn = ConditionalDETRAttention( + self.embed_dim*2, + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + self.nhead = config.decoder_attention_heads + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + query_sine_embed: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + is_first: Optional[bool] = False + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, 
src_len)` where padding elements are indicated by very large negative values. + position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the cross-attention layer. + query_position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the self-attention layer. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # ========== Begin of Self-Attention ============= + # Apply projections here + # shape: num_queries x batch_size x 256 + q_content = self.sa_qcontent_proj(hidden_states) # target is the input of the first decoder layer. zero by default. + q_pos = self.sa_qpos_proj(query_position_embeddings) + k_content = self.sa_kcontent_proj(hidden_states) + k_pos = self.sa_kpos_proj(query_position_embeddings) + v = self.sa_v_proj(hidden_states) + + bs, num_queries, n_model = q_content.shape + _, hw, _ = k_content.shape + + q = q_content + q_pos + k = k_content + k_pos + hidden_states, self_attn_weights = self.self_attn( + hidden_states=q, + attention_mask=attention_mask, + key_states=k, + value_states=v, + output_attentions=output_attentions, + ) + # ============ End of Self-Attention ============= + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # ========== Begin of Cross-Attention ============= + # Apply projections here + # shape: num_queries x batch_size x 256 + q_content = self.ca_qcontent_proj(hidden_states) + k_content = self.ca_kcontent_proj(encoder_hidden_states) + v = self.ca_v_proj(encoder_hidden_states) + + bs, num_queries, n_model = q_content.shape + _, hw, _ = k_content.shape + + k_pos = self.ca_kpos_proj(position_embeddings) + + # For the first decoder layer, we concatenate the positional embedding predicted from + # the object query (the positional embedding) into the original query (key) in DETR. 
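+        # Conditional cross-attention: only the first decoder layer additionally adds the learned query position
+        # embedding to the content query (and the key position embedding to the content key). In every layer, the
+        # per-head content query/key is then concatenated with its positional counterpart (the reference-point sine
+        # embedding for queries, the 2D sine position embedding for keys), doubling the channel dimension. This is
+        # why `self.encoder_attn` is constructed with queries/keys of size `embed_dim * 2` and values of `embed_dim`.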
+ if is_first: + q_pos = self.ca_qpos_proj(query_position_embeddings) + q = q_content + q_pos + k = k_content + k_pos + else: + q = q_content + k = k_content + + q = q.view(bs, num_queries, self.nhead, n_model//self.nhead) + query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) + query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model//self.nhead) + q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) + k = k.view(bs, hw, self.nhead, n_model//self.nhead) + k_pos = k_pos.view(bs, hw, self.nhead, n_model//self.nhead) + k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=q, + attention_mask=encoder_attention_mask, + key_states=k, + value_states=v, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # ============ End of Cross-Attention ============= + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDETR +class ConditionalDETRClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDETR +class ConditionalDETRPreTrainedModel(PreTrainedModel): + config_class = ConditionalDETRConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + xavier_std = self.config.init_xavier_std + + if isinstance(module, ConditionalDETRMHAttentionMap): + nn.init.zeros_(module.k_linear.bias) + 
nn.init.zeros_(module.q_linear.bias) + nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) + nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) + elif isinstance(module, ConditionalDETRLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, ConditionalDETRDecoder): + module.gradient_checkpointing = value + + +CONDITIONAL_DETR_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`ConditionalDETRConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CONDITIONAL_DETR_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See [`ConditionalDETRFeatureExtractor.__call__`] for + details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. 
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR +class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`ConditionalDETREncoderLayer`]. + + The encoder updates the flattened feature map through multiple self-attention layers. + + Small tweak for CONDITIONAL_DETR: + + - position_embeddings are added to the forward pass. + + Args: + config: ConditionalDETRConfig + """ + + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + self.layers = nn.ModuleList([ConditionalDETREncoderLayer(config) for _ in range(config.encoder_layers)]) + + # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + # we add position_embeddings as extra input to the encoder_layer + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDETRDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some small tweaks for CONDITIONAL_DETR: + + - position_embeddings and query_position_embeddings are added to the forward pass. + - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. 
+ + Args: + config: ConditionalDETRConfig + """ + + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + + self.layers = nn.ModuleList([ConditionalDETRDecoderLayer(config) for _ in range(config.decoder_layers)]) + # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output + self.layernorm = nn.LayerNorm(config.d_model) + d_model = config.d_model + self.gradient_checkpointing = False + + # query_scale is the FFN applied on f to generate transformation T + self.query_scale = MLP(d_model, d_model, d_model, 2) + self.ref_point_head = MLP(d_model, d_model, 2, 2) + for layer_id in range(config.decoder_layers - 1): + self.layers[layer_id + 1].ca_qpos_proj = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + query_position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The query embeddings that are passed into the decoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`: + + - 1 for queries that are **not masked**, + - 0 for queries that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each cross-attention layer. + query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + , *optional*): Position embeddings that are added to the queries and keys in each self-attention layer. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + input_shape = inputs_embeds.size()[:-1] + + combined_attention_mask = None + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # optional intermediate hidden states + intermediate = () if self.config.auxiliary_loss else None + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + reference_points_before_sigmoid = self.ref_point_head(query_position_embeddings) # [num_queries, batch_size, 2] + reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) + obj_center = reference_points[..., :2].transpose(0, 1) + # get sine embedding for the query vector + query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + if idx == 0: + pos_transformation = 1 + else: + pos_transformation = self.query_scale(hidden_states) + # apply transformation + query_sine_embed = query_sine_embed_before_transformation * pos_transformation + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + position_embeddings, + query_position_embeddings, + query_sine_embed, + encoder_hidden_states, + encoder_attention_mask, + None, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + query_sine_embed=query_sine_embed, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + is_first=(idx==0) + ) + + hidden_states = layer_outputs[0] + + if self.config.auxiliary_loss: + hidden_states = self.layernorm(hidden_states) + intermediate += (hidden_states,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # finally, apply layernorm + hidden_states = self.layernorm(hidden_states) + + # add hidden states from the last 
decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # stack intermediate decoder activations + if self.config.auxiliary_loss: + intermediate = torch.stack(intermediate) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate, reference_points] + if v is not None + ) + return ConditionalDETRDecoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + intermediate_hidden_states=intermediate, + reference_points=reference_points + ) + + +@add_start_docstrings( + """ + The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without + any specific head on top. + """, + CONDITIONAL_DETR_START_DOCSTRING, +) +class ConditionalDETRModel(ConditionalDETRPreTrainedModel): + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = ConditionalDETRTimmConvEncoder(config.backbone, config.dilation) + position_embeddings = build_position_encoding(config) + self.backbone = ConditionalDETRConvModel(backbone, position_embeddings) + + # Create projection layer + self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) + + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = ConditionalDETREncoder(config) + self.decoder = ConditionalDETRDecoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ConditionalDETRModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if 
pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), device=device) + + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # pixel_values should be of shape (batch_size, num_channels, height, width) + # pixel_mask should be of shape (batch_size, height, width) + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # get final feature map and downsampled mask + feature_map, mask = features[-1] + + if mask is None: + raise ValueError("Backbone does not return downsampled pixel mask") + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + projected_feature_map = self.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + inputs_embeds=queries, + attention_mask=None, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return ConditionalDETRModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + reference_points=decoder_outputs.reference_points + ) + + +@add_start_docstrings( + """ + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on 
top, for tasks + such as COCO detection. + """, + CONDITIONAL_DETR_START_DOCSTRING, +) +# Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR +class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel): + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + # CONDITIONAL_DETR encoder-decoder model + self.model = ConditionalDETRModel(config) + + # Object detection heads + self.class_labels_classifier = nn.Linear( + config.d_model, config.num_labels + ) # We add one for the "no object" class + self.bbox_predictor = ConditionalDETRMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + # Initialize weights and apply final processing + self.post_init() + + # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ConditionalDETRObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. 
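+            As an illustration (class indices and coordinates below are arbitrary), a batch containing a single image
+            with two annotated objects could be passed as:
+
+            ```python
+            import torch
+
+            labels = [
+                {
+                    "class_labels": torch.tensor([1, 3]),
+                    # boxes use the normalized (center_x, center_y, width, height) format expected by the loss
+                    "boxes": torch.tensor([[0.5, 0.4, 0.2, 0.3], [0.7, 0.6, 0.1, 0.2]]),
+                }
+            ]
+            ```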
+ + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # model predicts bounding boxes and corresponding COCO classes + >>> logits = outputs.logits + >>> bboxes = outputs.pred_boxes + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through CONDITIONAL_DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # class logits + predicted bounding boxes + logits = self.class_labels_classifier(sequence_output) + + reference = outputs.reference_points if return_dict else outputs[-1] + reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) + outputs_coords = [] + hs = sequence_output + tmp = self.bbox_predictor(hs) + tmp[..., :2] += reference_before_sigmoid + pred_boxes = tmp.sigmoid() + # pred_boxes = self.bbox_predictor(sequence_output).sigmoid() + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = ConditionalDETRHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = ConditionalDETRLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] + outputs_class = self.class_labels_classifier(intermediate) + + for lvl in range(hs.shape[0]): + tmp = self.bbox_predictor(hs[lvl]) + tmp[..., :2] += reference_before_sigmoid + outputs_coord = tmp.sigmoid() + outputs_coords.append(outputs_coord) + outputs_coord = torch.stack(outputs_coords) + + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": self.config.cls_loss_coefficient, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] 
for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return ConditionalDETRObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks + such as COCO panoptic. + + """, + CONDITIONAL_DETR_START_DOCSTRING, +) +# Copied from transformers.models.detr.modeling_detr.DetrForSegmentation with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR +class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): + def __init__(self, config: ConditionalDETRConfig): + super().__init__(config) + + # object detection model + self.conditional_detr = ConditionalDETRForObjectDetection(config) + + # segmentation head + hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads + intermediate_channel_sizes = self.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes + + self.mask_head = ConditionalDETRMaskHeadSmallConv( + hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size + ) + + self.bbox_attention = ConditionalDETRMHAttentionMap( + hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ConditionalDETRSegmentationOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each + dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels, + bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves + should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`. 
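+            For instance (shapes only, values arbitrary), a single image with two annotated objects and 480x640
+            segmentation masks could be passed as:
+
+            ```python
+            import torch
+
+            labels = [
+                {
+                    "class_labels": torch.tensor([17, 0]),
+                    "boxes": torch.rand(2, 4),  # normalized (center_x, center_y, width, height)
+                    "masks": torch.zeros(2, 480, 640),  # one binary mask per annotated object
+                }
+            ]
+            ```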
+ + Returns: + + Examples: + + ```python + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + >>> model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # model predicts COCO classes, bounding boxes, and masks + >>> logits = outputs.logits + >>> bboxes = outputs.pred_boxes + >>> masks = outputs.pred_masks + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones((batch_size, height, width), device=device) + + # First, get list of feature maps and position embeddings + features, position_embeddings_list = self.conditional_detr.model.backbone(pixel_values, pixel_mask=pixel_mask) + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + feature_map, mask = features[-1] + batch_size, num_channels, height, width = feature_map.shape + projected_feature_map = self.conditional_detr.model.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.conditional_detr.model.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.conditional_detr.model.query_position_embeddings.weight.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.conditional_detr.model.decoder( + inputs_embeds=queries, + attention_mask=None, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + 
encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Sixth, compute logits, pred_boxes and pred_masks + logits = self.conditional_detr.class_labels_classifier(sequence_output) + pred_boxes = self.conditional_detr.bbox_predictor(sequence_output).sigmoid() + + memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width) + mask = flattened_mask.view(batch_size, height, width) + + # FIXME h_boxes takes the last one computed, keep this in mind + # important: we need to reverse the mask, since in the original implementation the mask works reversed + # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32) + bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask) + + seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) + + pred_masks = seg_masks.view(batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = ConditionalDETRHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality", "masks"] + criterion = ConditionalDETRLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + outputs_loss["pred_masks"] = pred_masks + if self.config.auxiliary_loss: + intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1] + outputs_class = self.class_labels_classifier(intermediate) + outputs_coord = self.bbox_predictor(intermediate).sigmoid() + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + weight_dict["loss_mask"] = self.config.mask_loss_coefficient + weight_dict["loss_dice"] = self.config.dice_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs + else: + output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return ConditionalDETRSegmentationOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + pred_masks=pred_masks, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=decoder_outputs.last_hidden_state, + 
decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def _expand(tensor, length: int): + return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py +# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDETR +class ConditionalDETRMaskHeadSmallConv(nn.Module): + """ + Simple convolutional head, using group norm. Upsampling is done using a FPN approach + """ + + def __init__(self, dim, fpn_dims, context_dim): + super().__init__() + + if dim % 8 != 0: + raise ValueError( + "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8" + ) + + inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + + self.lay1 = nn.Conv2d(dim, dim, 3, padding=1) + self.gn1 = nn.GroupNorm(8, dim) + self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1) + self.gn2 = nn.GroupNorm(8, inter_dims[1]) + self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = nn.GroupNorm(8, inter_dims[2]) + self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = nn.GroupNorm(8, inter_dims[3]) + self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = nn.GroupNorm(8, inter_dims[4]) + self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1) + + self.dim = dim + + self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): + # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with + # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32). + # We expand the projected feature map to match the number of heads. 
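+        # After this concatenation, x has (d_model + num_attention_heads) channels, which matches the `dim` argument
+        # this module was constructed with (`hidden_size + number_of_heads` in ConditionalDETRForSegmentation).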
+ x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) + + x = self.lay1(x) + x = self.gn1(x) + x = nn.functional.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter1(fpns[0]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay3(x) + x = self.gn3(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter2(fpns[1]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay4(x) + x = self.gn4(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter3(fpns[2]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay5(x) + x = self.gn5(x) + x = nn.functional.relu(x) + + x = self.out_lay(x) + return x + + +# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDETR +class ConditionalDETRMHAttentionMap(nn.Module): + """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + + self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 + + def forward(self, q, k, mask: Optional[Tensor] = None): + q = self.q_linear(q) + k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) + keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) + + if mask is not None: + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = self.dropout(weights) + return weights + + +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. 
Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py +class ConditionalDETRLoss(nn.Module): + """ + This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process happens in two steps: 1) + we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair + of matched ground-truth / prediction (supervise class and box). + + + + Args: + matcher (`ConditionalDETRHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parmeter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor of dim + [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + src_logits = outputs["logits"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2]+1], + dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:,:,:-1] + loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1] + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. 
+ """ + logits = outputs["logits"] + device = logits.device + tgt_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(src_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the masks: the focal loss and the dice loss. + + Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. + """ + if "pred_masks" not in outputs: + raise KeyError("No predicted masks found in outputs") + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = nn.functional.interpolate( + src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False + ) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + "masks": self.loss_masks, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + 
def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDETR,detr->conditional_detr +class ConditionalDETRMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py +class ConditionalDETRHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. 
+ giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 or bbox_cost == 0 or giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. + + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["class_labels"] for v in targets]) + tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(tgt_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py + + +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. 
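To make the matching step above concrete, here is a self-contained sketch on toy tensors. The focal-style class cost and the L1 box cost follow the logic shown above, while the GIoU term is omitted for brevity and the cost weights are illustrative assumptions, not the values used by the model:

```python
# Minimal sketch (toy data): build a per-image cost matrix and solve it with the Hungarian solver.
import torch
from scipy.optimize import linear_sum_assignment

num_queries, num_classes = 5, 3
logits = torch.randn(1, num_queries, num_classes)   # one image in the batch
pred_boxes = torch.rand(1, num_queries, 4)          # (cx, cy, w, h), normalized
tgt_labels = torch.tensor([0, 2])                   # 2 ground-truth objects
tgt_boxes = torch.rand(2, 4)

out_prob = logits.flatten(0, 1).sigmoid()           # (num_queries, num_classes)
out_bbox = pred_boxes.flatten(0, 1)                 # (num_queries, 4)

alpha, gamma = 0.25, 2.0
neg_cost = (1 - alpha) * out_prob**gamma * (-(1 - out_prob + 1e-8).log())
pos_cost = alpha * (1 - out_prob) ** gamma * (-(out_prob + 1e-8).log())
class_cost = pos_cost[:, tgt_labels] - neg_cost[:, tgt_labels]  # (num_queries, 2)
bbox_cost = torch.cdist(out_bbox, tgt_boxes, p=1)               # (num_queries, 2)

cost = 2 * class_cost + 5 * bbox_cost  # weights are illustrative only; GIoU cost omitted
pred_idx, tgt_idx = linear_sum_assignment(cost.numpy())
# pred_idx[i] is the query matched to ground-truth object tgt_idx[i]
```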
+ + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# modified from torchvision to also return the union +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. + + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / area + + +# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306 + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index e1f4f3b1fd9fa..f61b8dfdda282 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -24,6 +24,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class ConditionalDETRFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + 
requires_backends(self, ["vision"]) + + class ConvNextFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/conditional_detr/__init__.py b/tests/models/conditional_detr/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py new file mode 100644 index 0000000000000..b5e19fe005da7 --- /dev/null +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -0,0 +1,338 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_vision_available + +from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDETRFeatureExtractor + + +class ConditionalDETRFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "max_size": self.max_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to ConditionalDETRFeatureExtractor, + assuming do_resize is set to True with a scalar size. 
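As a rough illustration of the resize rule this helper assumes, the sketch below (hypothetical `expected_resized_shape` function, not part of this patch) scales the shorter image side to `size` and the longer side by the same factor:

```python
# Minimal sketch: shortest-side resize, keeping the aspect ratio.
def expected_resized_shape(height, width, size=18):
    if width < height:
        return int(size * height / width), size   # (new_height, new_width)
    if width > height:
        return size, int(size * width / height)
    return size, size

print(expected_resized_shape(400, 300))  # (24, 18)
print(expected_resized_shape(300, 400))  # (18, 24)
```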
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size * h / w) + expected_width = self.size + elif w > h: + expected_height = self.size + expected_width = int(self.size * w / h) + else: + expected_height = self.size + expected_width = self.size + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + +@require_torch +@require_vision +class ConditionalDETRFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = ConditionalDETRFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = ConditionalDETRFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "max_size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = 
feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_equivalence_pad_and_create_pixel_mask(self): + # Initialize feature_extractors + feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict) + feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test whether the method "pad_and_return_pixel_mask" and calling the feature extractor return the same tensors + encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") + encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") + + assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # encode them + feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + assert torch.allclose(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = 
torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["labels"][0]["size"], expected_size) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # encode them + # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic + feature_extractor = ConditionalDETRFeatureExtractor(format="coco_panoptic") + encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + assert torch.allclose(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify masks + expected_masks_sum = 822338 + self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["labels"][0]["size"], expected_size) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py 
b/tests/models/conditional_detr/test_modeling_conditional_detr.py new file mode 100644 index 0000000000000..38933f26c6627 --- /dev/null +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -0,0 +1,538 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch CONDITIONAL_DETR model. """ + + +import inspect +import math +import unittest + +from transformers import ConditionalDETRConfig, is_timm_available, is_vision_available +from transformers.testing_utils import require_timm, require_vision, slow, torch_device +from transformers.utils import cached_property + +from ...generation.test_generation_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor + + +if is_timm_available(): + import torch + + from transformers import ConditionalDETRForObjectDetection, ConditionalDETRForSegmentation, ConditionalDETRModel + + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDETRFeatureExtractor + + +class ConditionalDETRModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=256, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + min_size=200, + max_size=200, + n_targets=8, + num_labels=91, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.min_size = min_size + self.max_size = max_size + self.n_targets = n_targets + self.num_labels = num_labels + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) + + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + 
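The targets built by this tester follow the label format the detection model expects at training time: one dict per image, passed as a list. A minimal usage sketch follows; it assumes the classes added by this patch plus the timm and scipy backends, and the default config will download backbone weights:

```python
# Minimal usage sketch (assumes the classes introduced in this patch are importable).
import torch
from transformers import ConditionalDETRConfig, ConditionalDETRForObjectDetection

config = ConditionalDETRConfig(num_queries=12, num_labels=91)
model = ConditionalDETRForObjectDetection(config)

pixel_values = torch.rand(1, 3, 200, 200)
# boxes are (center_x, center_y, width, height), normalized to [0, 1]
labels = [{"class_labels": torch.tensor([5, 17]), "boxes": torch.rand(2, 4)}]
outputs = model(pixel_values=pixel_values, labels=labels)
print(outputs.loss)
```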
target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + return ConditionalDETRConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_conditional_detr_model(self, config, pixel_values, pixel_mask, labels): + model = ConditionalDETRModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) + ) + + def create_and_check_conditional_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = ConditionalDETRForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_timm +class ConditionalDETRModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + ConditionalDETRModel, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ) + if is_timm_available() + else () + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ in ["ConditionalDETRForObjectDetection", "ConditionalDETRForSegmentation"]: + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = 
ConditionalDETRModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConditionalDETRConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_conditional_detr_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_conditional_detr_model(*config_and_inputs) + + def test_conditional_detr_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_conditional_detr_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="CONDITIONAL_DETR does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="CONDITIONAL_DETR does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @slow + def test_model_outputs_equivalence(self): + # TODO Niels: fix me! + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + decoder_seq_length = self.model_tester.decoder_seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + decoder_key_length = self.model_tester.decoder_seq_length + encoder_key_length = self.model_tester.encoder_seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 6 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "ConditionalDETRForObjectDetection": + correct_outlen += 1 + # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks + if model_class.__name__ == "ConditionalDETRForSegmentation": + correct_outlen += 2 + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), 
self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, 
inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "ConditionalDETRForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + configs_no_init.init_xavier_std = 1e9 + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "bbox_attention" in name and "bias" not in name: + self.assertLess( + 100000, + abs(param.data.max().item()), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class ConditionalDETRModelIntegrationTests(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + if is_vision_available() + else None + ) + + def test_inference_no_head(self): + model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape = torch.Size((1, 300, 256)) + assert outputs.last_hidden_state.shape == expected_shape + expected_slice = torch.tensor( + [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + def test_inference_object_detection_head(self): + model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, 
model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + def test_inference_panoptic_segmentation_head(self): + model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-18.1565, -1.7568, -13.5029], [-16.8888, -1.4138, -14.1028], [-17.5709, -2.5080, -11.8654]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.5344, 0.1789, 0.9285], [0.4420, 0.0572, 0.0875], [0.6630, 0.6887, 0.1017]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + expected_shape_masks = torch.Size((1, model.config.num_queries, 200, 267)) + self.assertEqual(outputs.pred_masks.shape, expected_shape_masks) + expected_slice_masks = torch.tensor( + [[-7.7558, -10.8788, -11.9797], [-11.8881, -16.4329, -17.7451], [-14.7316, -19.7383, -20.3004]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, atol=1e-3)) From aa64e297ab5d110cb2a1920b5dd3de95fbd32ca6 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:54:36 -0400 Subject: [PATCH 167/233] checked copies --- docs/source/en/index.mdx | 2 +- .../modeling_conditional_detr.py | 63 ++++++++++++------- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 7830c57e6eabd..84d489e22ddf1 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,7 +68,7 @@ The documentation is organized into five sections: 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. 
**[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . +1. **[conditional_detr](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 1f63d22be27a5..4ce1f8f3d5776 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -153,8 +153,8 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding - boxes. + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and @@ -217,12 +217,13 @@ class ConditionalDETRSegmentationOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the unnormalized bounding - boxes. + possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + unnormalized bounding boxes. pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): - Segmentation masks logits for all queries. See also [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or - [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks - respectively. 
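A small sketch (toy values, not part of this patch) of what retrieving "unnormalized" boxes amounts to: normalized `(center_x, center_y, width, height)` coordinates are turned into absolute `(x0, y0, x1, y1)` corners by rescaling with the image size, which is essentially the box-conversion step performed by `post_process`:

```python
# Minimal sketch: convert normalized center-format boxes to absolute corner coordinates.
import torch

def to_absolute_corners(pred_boxes, img_height, img_width):
    cx, cy, w, h = pred_boxes.unbind(-1)
    corners = torch.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], dim=-1)
    scale = torch.tensor([img_width, img_height, img_width, img_height], dtype=corners.dtype)
    return corners * scale

boxes = torch.tensor([[0.5, 0.5, 0.2, 0.4]])  # one normalized box
print(to_absolute_corners(boxes, img_height=480, img_width=640))
# tensor([[256., 144., 384., 336.]])
```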
+ Segmentation masks logits for all queries. See also + [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation + masks respectively. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and @@ -326,7 +327,6 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) -# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder with Detr->ConditionalDETR class ConditionalDETRTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. @@ -1581,7 +1581,6 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -# Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel): def __init__(self, config: ConditionalDETRConfig): super().__init__(config) @@ -1670,7 +1669,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] @@ -1755,7 +1754,6 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -# Copied from transformers.models.detr.modeling_detr.DetrForSegmentation with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR,detr->conditional_detr,facebook/detr-resnet-50->Atten4Vis/ConditionalDETR class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): def __init__(self, config: ConditionalDETRConfig): super().__init__(config) @@ -1807,22 +1805,40 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation - >>> from PIL import Image + >>> import io >>> import requests + >>> from PIL import Image + >>> import torch + >>> import numpy + + >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from transformers.models.conditional_detr.feature_extraction_conditional_detr import rgb_to_id >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") - >>> model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic") + >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained( + ... "facebook/conditional_detr-resnet-50-panoptic" + ... 
) + >>> model = ConditionalDETRForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") + >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") + + >>> # forward pass >>> outputs = model(**inputs) - >>> # model predicts COCO classes, bounding boxes, and masks - >>> logits = outputs.logits - >>> bboxes = outputs.pred_boxes - >>> masks = outputs.pred_masks + + >>> # use the `post_process_panoptic` method of `ConditionalDETRFeatureExtractor` to convert to COCO format + >>> processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0) + >>> result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0] + + >>> # the segmentation is stored in a special-format png + >>> panoptic_seg = Image.open(io.BytesIO(result["png_string"])) + >>> panoptic_seg = numpy.array(panoptic_seg, dtype=numpy.uint8) + >>> # retrieve the ids corresponding to each mask + >>> panoptic_seg_id = rgb_to_id(panoptic_seg) + >>> panoptic_seg_id.shape + (800, 1066) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1903,7 +1919,9 @@ def forward( seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) - pred_masks = seg_masks.view(batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + pred_masks = seg_masks.view( + batch_size, self.conditional_detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1] + ) loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: @@ -1985,7 +2003,8 @@ def __init__(self, dim, fpn_dims, context_dim): if dim % 8 != 0: raise ValueError( - "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8" + "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in" + " GroupNorm is set to 8" ) inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] @@ -2077,7 +2096,7 @@ def forward(self, q, k, mask: Optional[Tensor] = None): weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) if mask is not None: - weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights From e70647f5ad76f24ddb968d1b48cd1eeb7d29acdd Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:55:14 -0400 Subject: [PATCH 168/233] checked copies --- .../models/conditional_detr/modeling_conditional_detr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 4ce1f8f3d5776..1473df759cd8e 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2347,7 +2347,6 @@ def forward(self, outputs, targets): # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDETR,detr->conditional_detr class ConditionalDETRMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), 
used to predict the normalized center coordinates, From 145ba8ae0c7a85a058b719fd59c253cbfc6d796f Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 01:22:01 -0400 Subject: [PATCH 169/233] fixed style and copies --- docs/source/en/_toctree.yml | 8 +- .../configuration_conditional_detr.py | 8 - ..._original_pytorch_checkpoint_to_pytorch.py | 54 +++++-- .../feature_extraction_conditional_detr.py | 4 +- .../modeling_conditional_detr.py | 144 +++++++++++------- 5 files changed, 133 insertions(+), 85 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index c21388b60a6f9..f6d6a94eee334 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,7 +42,8 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - sections: + - isExpanded: false + sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +59,6 @@ - local: tasks/multiple_choice title: Multiple choice title: Task guides - isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification @@ -362,6 +362,8 @@ sections: - local: model_doc/beit title: BEiT + - local: model_doc/conditional_detr + title: ConditionalDETR - local: model_doc/convnext title: ConvNeXT - local: model_doc/cvt @@ -501,4 +503,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API \ No newline at end of file + title: API diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index ce004dc6fc15c..bfb759ca246ca 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -211,14 +211,6 @@ def num_attention_heads(self) -> int: def hidden_size(self) -> int: return self.d_model - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model - class ConditionalDETROnnxConfig(OnnxConfig): diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index 2d7abc3c088b5..4b56d729d1f35 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -94,29 +94,55 @@ rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) # q, k, v projections in self/cross-attention in decoder for conditional DETR - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight") + ) + 
rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight") + ) rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight") + ) # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight") + ) rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight") + ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias") + ) rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias") + ) # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias") + ) rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) - 
rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias") + ) # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads # for conditional DETR, also convert reference point head and query scale MLP @@ -144,7 +170,7 @@ ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), - ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias") + ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias"), ] ) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 07550124c157e..f37e3e1fb69bf 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -704,8 +704,8 @@ def post_process(self, outputs, target_sizes): topk_boxes = topk_indexes // out_logits.shape[2] labels = topk_indexes % out_logits.shape[2] boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4)) - + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + # and from relative [0, 1] to absolute [0, height] coordinates img_h, img_w = target_sizes.unbind(1) scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 1473df759cd8e..54dea544f3da9 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -62,13 +62,13 @@ ] - @dataclass class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): """ - Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, - namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them - gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + Base class for outputs of the Conditional DETR decoder. This class adds one attribute to + BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output + of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary + decoding losses. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -97,9 +97,10 @@ class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): @dataclass class ConditionalDETRModelOutput(Seq2SeqModelOutput): """ - Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, - namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them - gone through a layernorm. 
This is useful when training the model with auxiliary decoding losses. + Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to + Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder + layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding + losses. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -389,6 +390,7 @@ def forward(self, pixel_values, pixel_mask): return out, pos + # Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ @@ -479,6 +481,7 @@ def build_position_encoding(config): return position_embedding + # function to generate sine positional embedding for 2d coordinates def gen_sineembed_for_position(pos_tensor): scale = 2 * math.pi @@ -493,11 +496,13 @@ def gen_sineembed_for_position(pos_tensor): pos = torch.cat((pos_y, pos_x), dim=2) return pos + def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) - return torch.log(x1/x2) + return torch.log(x1 / x2) + class DetrAttention(nn.Module): """ @@ -521,7 +526,8 @@ def __init__( self.head_dim = embed_dim // num_heads if self.head_dim * num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." ) self.scaling = self.head_dim**-0.5 @@ -585,7 +591,8 @@ def forward( if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" ) if attention_mask is not None: @@ -614,7 +621,8 @@ def forward( if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) @@ -625,12 +633,13 @@ def forward( return attn_output, attn_weights_reshaped + class ConditionalDETRAttention(nn.Module): """ Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. - The key q_proj, k_proj, v_proj are defined outside the attention. - This attention allows the dim of q, k to be different to v. + The key q_proj, k_proj, v_proj are defined outside the attention. This attention allows the dim of q, k to be + different to v. """ def __init__( @@ -649,7 +658,8 @@ def __init__( self.head_dim = embed_dim // num_heads if self.head_dim * num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." 
) # head dimension of values self.v_head_dim = out_dim // num_heads @@ -663,6 +673,7 @@ def __init__( def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() @@ -696,7 +707,8 @@ def forward( if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" ) if attention_mask is not None: @@ -725,7 +737,8 @@ def forward( if attn_output.size() != (bsz * self.num_heads, tgt_len, self.v_head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is {attn_output.size()}" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.v_head_dim)}, but is" + f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.v_head_dim) @@ -818,19 +831,18 @@ def __init__(self, config: ConditionalDETRConfig): self.sa_kcontent_proj = nn.Linear(d_model, d_model) self.sa_kpos_proj = nn.Linear(d_model, d_model) self.sa_v_proj = nn.Linear(d_model, d_model) - + self.self_attn = ConditionalDETRAttention( embed_dim=self.embed_dim, out_dim=self.embed_dim, num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout + dropout=config.attention_dropout, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - # Decoder Cross-Attention projections self.ca_qcontent_proj = nn.Linear(d_model, d_model) @@ -841,10 +853,7 @@ def __init__(self, config: ConditionalDETRConfig): self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) self.encoder_attn = ConditionalDETRAttention( - self.embed_dim*2, - self.embed_dim, - config.decoder_attention_heads, - dropout=config.attention_dropout + self.embed_dim * 2, self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) @@ -862,7 +871,7 @@ def forward( encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, - is_first: Optional[bool] = False + is_first: Optional[bool] = False, ): """ Args: @@ -888,7 +897,9 @@ def forward( # ========== Begin of Self-Attention ============= # Apply projections here # shape: num_queries x batch_size x 256 - q_content = self.sa_qcontent_proj(hidden_states) # target is the input of the first decoder layer. zero by default. + q_content = self.sa_qcontent_proj( + hidden_states + ) # target is the input of the first decoder layer. zero by default. 
q_pos = self.sa_qpos_proj(query_position_embeddings) k_content = self.sa_kcontent_proj(hidden_states) k_pos = self.sa_kpos_proj(query_position_embeddings) @@ -924,7 +935,7 @@ def forward( k_pos = self.ca_kpos_proj(position_embeddings) - # For the first decoder layer, we concatenate the positional embedding predicted from + # For the first decoder layer, we concatenate the positional embedding predicted from # the object query (the positional embedding) into the original query (key) in DETR. if is_first: q_pos = self.ca_qpos_proj(query_position_embeddings) @@ -934,12 +945,12 @@ def forward( q = q_content k = k_content - q = q.view(bs, num_queries, self.nhead, n_model//self.nhead) + q = q.view(bs, num_queries, self.nhead, n_model // self.nhead) query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) - query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model//self.nhead) + query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model // self.nhead) q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) - k = k.view(bs, hw, self.nhead, n_model//self.nhead) - k_pos = k_pos.view(bs, hw, self.nhead, n_model//self.nhead) + k = k.view(bs, hw, self.nhead, n_model // self.nhead) + k_pos = k_pos.view(bs, hw, self.nhead, n_model // self.nhead) k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) # Cross-Attention Block @@ -996,8 +1007,9 @@ def forward(self, hidden_states: torch.Tensor): hidden_states = self.out_proj(hidden_states) return hidden_states + class MLP(nn.Module): - """ Very simple multi-layer perceptron (also called FFN)""" + """Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() @@ -1066,8 +1078,8 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See [`ConditionalDETRFeatureExtractor.__call__`] for - details. + Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See + [`ConditionalDETRFeatureExtractor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Mask to avoid performing attention on padding pixel values. 
Mask values selected in `[0, 1]`: @@ -1323,11 +1335,13 @@ def forward( all_self_attns = () if output_attentions else None all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - reference_points_before_sigmoid = self.ref_point_head(query_position_embeddings) # [num_queries, batch_size, 2] + reference_points_before_sigmoid = self.ref_point_head( + query_position_embeddings + ) # [num_queries, batch_size, 2] reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) - obj_center = reference_points[..., :2].transpose(0, 1) + obj_center = reference_points[..., :2].transpose(0, 1) # get sine embedding for the query vector - query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1339,7 +1353,7 @@ def forward( if idx == 0: pos_transformation = 1 else: - pos_transformation = self.query_scale(hidden_states) + pos_transformation = self.query_scale(hidden_states) # apply transformation query_sine_embed = query_sine_embed_before_transformation * pos_transformation if self.gradient_checkpointing and self.training: @@ -1372,7 +1386,7 @@ def custom_forward(*inputs): encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, - is_first=(idx==0) + is_first=(idx == 0), ) hidden_states = layer_outputs[0] @@ -1401,7 +1415,14 @@ def custom_forward(*inputs): if not return_dict: return tuple( v - for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate, reference_points] + for v in [ + hidden_states, + all_hidden_states, + all_self_attns, + all_cross_attentions, + intermediate, + reference_points, + ] if v is not None ) return ConditionalDETRDecoderOutput( @@ -1410,14 +1431,14 @@ def custom_forward(*inputs): attentions=all_self_attns, cross_attentions=all_cross_attentions, intermediate_hidden_states=intermediate, - reference_points=reference_points + reference_points=reference_points, ) @add_start_docstrings( """ - The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without - any specific head on top. + The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. """, CONDITIONAL_DETR_START_DOCSTRING, ) @@ -1570,14 +1591,14 @@ def forward( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, - reference_points=decoder_outputs.reference_points + reference_points=decoder_outputs.reference_points, ) @add_start_docstrings( """ - CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks - such as COCO detection. + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. 
""", CONDITIONAL_DETR_START_DOCSTRING, ) @@ -1669,7 +1690,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] @@ -1748,8 +1769,8 @@ def forward( @add_start_docstrings( """ - CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks - such as COCO panoptic. + CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, + for tasks such as COCO panoptic. """, CONDITIONAL_DETR_START_DOCSTRING, @@ -2154,9 +2175,9 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py class ConditionalDETRLoss(nn.Module): """ - This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process happens in two steps: 1) - we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair - of matched ground-truth / prediction (supervise class and box). + This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process + happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) + we supervise each pair of matched ground-truth / prediction (supervise class and box). @@ -2181,8 +2202,8 @@ def __init__(self, matcher, num_classes, focal_alpha, losses): # removed logging parameter, which was part of the original implementation def loss_labels(self, outputs, targets, indices, num_boxes): """ - Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor of dim - [nb_target_boxes] + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] """ if "logits" not in outputs: raise KeyError("No logits were found in the outputs") @@ -2195,12 +2216,19 @@ def loss_labels(self, outputs, targets, indices, num_boxes): ) target_classes[idx] = target_classes_o - target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2]+1], - dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) + target_classes_onehot = torch.zeros( + [src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1], + dtype=src_logits.dtype, + layout=src_logits.layout, + device=src_logits.device, + ) target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) - target_classes_onehot = target_classes_onehot[:,:,:-1] - loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1] + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * src_logits.shape[1] + ) losses = {"loss_ce": loss_ce} return losses @@ -2430,7 +2458,7 @@ def forward(self, outputs, targets): # Compute the classification cost. 
alpha = 0.25 gamma = 2.0 - neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) class_cost = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] From 9de75d15de4cf8f502ea50c67715f2875f3522e2 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:14:09 -0400 Subject: [PATCH 170/233] fixed style and copies --- .../conditional_detr/configuration_conditional_detr.py | 8 ++++---- ...itional_detr_original_pytorch_checkpoint_to_pytorch.py | 1 + utils/check_repo.py | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index bfb759ca246ca..844260f0d12f5 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,7 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DeppMeng/ConditionalDETR": "https://huggingface.co/DeppMeng/ConditionalDETR/resolve/main/config.json", + "DepuMeng/ConditionalDETR": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", } @@ -36,7 +36,7 @@ class ConditionalDETRConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DeppMeng/ConditionalDETR](https://huggingface.co/DeppMeng/ConditionalDETR) architecture. + [DepuMeng/ConditionalDETR](https://huggingface.co/DepuMeng/ConditionalDETR) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
@@ -114,10 +114,10 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DeppMeng/ConditionalDETR style configuration + >>> # Initializing a CONDITIONAL_DETR DepuMeng/ConditionalDETR style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DeppMeng/ConditionalDETR style configuration + >>> # Initializing a model from the DepuMeng/ConditionalDETR style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index 4b56d729d1f35..f3c6fcadb2e2d 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -293,6 +293,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): model = ConditionalDETRForSegmentation(config) if is_panoptic else ConditionalDETRForObjectDetection(config) model.load_state_dict(state_dict) model.eval() + model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") # verify our conversion original_outputs = conditional_detr(pixel_values) outputs = model(pixel_values) diff --git a/utils/check_repo.py b/utils/check_repo.py index ea3b997f1f126..ffc47aa72af8f 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -60,6 +60,8 @@ "DetrEncoder", # Building part of bigger (tested) model. "DetrDecoder", # Building part of bigger (tested) model. "DetrDecoderWrapper", # Building part of bigger (tested) model. + "ConditionalDETREncoder", # Building part of bigger (tested) model. + "ConditionalDETRDecoder", # Building part of bigger (tested) model. "M2M100Encoder", # Building part of bigger (tested) model. "M2M100Decoder", # Building part of bigger (tested) model. "MCTCTEncoder", # Building part of bigger (tested) model. From 52764fa4d0a5ff91b0c1f315c2478e736a87ce66 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:18:56 -0400 Subject: [PATCH 171/233] fixed hub --- .../conditional_detr/configuration_conditional_detr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 844260f0d12f5..3c05d959879ec 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,7 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/ConditionalDETR": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", + "DepuMeng/conditional_detr_resnet50": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", } @@ -36,7 +36,7 @@ class ConditionalDETRConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DepuMeng/ConditionalDETR](https://huggingface.co/DepuMeng/ConditionalDETR) architecture. + [DepuMeng/conditional_detr_resnet50](https://huggingface.co/DepuMeng/conditional_detr_resnet50) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -114,10 +114,10 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DepuMeng/ConditionalDETR style configuration + >>> # Initializing a CONDITIONAL_DETR DepuMeng/conditional_detr_resnet50 style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DepuMeng/ConditionalDETR style configuration + >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration From 4832c6d9bcde5d9705c761a37731f249f17da7fa Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 02:27:32 -0400 Subject: [PATCH 172/233] fixed style --- .../models/conditional_detr/configuration_conditional_detr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 3c05d959879ec..4291a6ecc4a94 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,7 +27,9 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/conditional_detr_resnet50": "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json", + "DepuMeng/conditional_detr_resnet50": ( + "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json" + ), } From 758bb442e8da1be759e98465d691691d4836327d Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:08 -0400 Subject: [PATCH 173/233] Update README.md Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a481458cde19e..ec8a0fd2e8b39 100644 --- a/README.md +++ b/README.md @@ -278,7 +278,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From 241cb6dd638cc6b141ffbad287c0569ce15871fa Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:17 -0400 Subject: [PATCH 174/233] Update docs/source/en/_toctree.yml Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f6d6a94eee334..d158be23cea58 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -363,7 +363,7 @@ - local: model_doc/beit title: BEiT - local: model_doc/conditional_detr - title: ConditionalDETR + title: Conditional DETR - local: model_doc/convnext title: ConvNeXT - local: model_doc/cvt From 4e64ef39b3db3b54ff68908d87e46a6d90c72487 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:33:58 -0400 Subject: [PATCH 175/233] Update docs/source/en/index.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 84d489e22ddf1..4d3aa7711503d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,7 +68,7 @@ The documentation is organized into five sections: 1. 
**[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From e0aecc4584e437aa80daba1f2f9e17bed8a674a2 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:34:10 -0400 Subject: [PATCH 176/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 4291a6ecc4a94..722608a7c945e 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From aa3b2188f9809eb9be7337b391d5bb23cb9ba9ad Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:34:32 -0400 Subject: [PATCH 177/233] Update src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- ...t_conditional_detr_original_pytorch_checkpoint_to_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index f3c6fcadb2e2d..bcb2bbdda9a7d 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert CONDITIONAL_DETR checkpoints.""" +"""Convert Conditional DETR checkpoints.""" import argparse From 7812e53b3e111b811db10f6a2dbf231c9483b619 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:37:15 -0400 Subject: [PATCH 178/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 722608a7c945e..278ed04c14a35 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" CONDITIONAL_DETR model configuration""" +""" Conditional DETR model configuration""" from collections import OrderedDict from typing import Mapping From d873c12742e7c015ab20bc68b27fa17d388a08e4 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 10:48:44 -0400 Subject: [PATCH 179/233] fixed some doc issue --- docs/source/en/_toctree.yml | 6 +++--- src/transformers/models/auto/feature_extraction_auto.py | 2 -- .../conditional_detr/configuration_conditional_detr.py | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d158be23cea58..dfbdd503e6005 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,8 +42,7 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - isExpanded: false - sections: + - sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +57,8 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides + isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 9ba8295b406da..015fd132ef0dc 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,8 +39,6 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 278ed04c14a35..2d958a0fbe0f5 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,8 +27,8 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "DepuMeng/conditional_detr_resnet50": ( - "https://huggingface.co/DepuMeng/conditional_detr_resnet50/resolve/main/config.json" + "microsoft/conditional_detr_resnet50": ( + "https://huggingface.co/microsoft/conditional_detr_resnet50/resolve/main/config.json" ), } From 143bda2018f6b98d5ae65e51e7bb49281fea656c Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:36 -0400 Subject: [PATCH 180/233] Update docs/source/en/model_doc/conditional_detr.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/model_doc/conditional_detr.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx index 51316a24e43ee..53129b77ea05a 100644 --- a/docs/source/en/model_doc/conditional_detr.mdx +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. ## Overview -The conditional DETR model was proposed in [ConditionalDETR](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 
Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. +The Conditional DETR model was proposed in [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. The abstract from the paper is the following: From 8c24e88f03d5d0d5959c42137b50d97995fe40f1 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:42 -0400 Subject: [PATCH 181/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/configuration_conditional_detr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 2d958a0fbe0f5..e2a005215d0e0 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -36,9 +36,9 @@ class ConditionalDETRConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate - a CONDITIONAL_DETR model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the CONDITIONAL_DETR - [DepuMeng/conditional_detr_resnet50](https://huggingface.co/DepuMeng/conditional_detr_resnet50) architecture. + a Conditional DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Conditional DETR + [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
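
For reference, the detection classes touched by the patches above can be exercised end to end. The snippet below is a minimal inference sketch rather than part of any patch: it assumes the converted checkpoint has been pushed to the Hub under the `microsoft` organization (the exact repo id — `microsoft/conditional-detr-resnet-50` vs. `microsoft/conditional_detr_resnet50` — varies across these commits), and it uses the `ConditionalDETR*` class names as they stand at this point in the series; a later patch renames the prefix to `ConditionalDetr`.

```python
import requests
import torch
from PIL import Image

from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection

# Assumed checkpoint id; see the archive map / config docstrings edited in the patches above.
checkpoint = "microsoft/conditional-detr-resnet-50"

feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained(checkpoint)
model = ConditionalDETRForObjectDetection.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# `post_process` (edited in feature_extraction_conditional_detr.py above) rescales the
# predicted boxes to the original image size and, like the DETR post-processor it is
# copied from, returns one dict per image with "scores", "labels" and "boxes".
target_sizes = torch.tensor([image.size[::-1]])
results = feature_extractor.post_process(outputs, target_sizes)[0]
print(results["scores"].shape, results["labels"].shape, results["boxes"].shape)
```
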
From daa2da0a65eb03fc8601fb929ea2878c015bfa7a Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:54 -0400 Subject: [PATCH 182/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index e2a005215d0e0..d6013fa7e68ea 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -116,7 +116,7 @@ class ConditionalDETRConfig(PretrainedConfig): ```python >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig - >>> # Initializing a CONDITIONAL_DETR DepuMeng/conditional_detr_resnet50 style configuration + >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration >>> configuration = ConditionalDETRConfig() >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration From 641f36d5264a9af75e01a03b2d7de7665ce785c9 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 10:47:59 -0400 Subject: [PATCH 183/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index d6013fa7e68ea..4e0b4674356d9 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -119,7 +119,7 @@ class ConditionalDETRConfig(PretrainedConfig): >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration >>> configuration = ConditionalDETRConfig() - >>> # Initializing a model from the DepuMeng/conditional_detr_resnet50 style configuration + >>> # Initializing a model from the microsoft/conditional-detr-resnet-50 style configuration >>> model = ConditionalDETRModel(configuration) >>> # Accessing the model configuration From b3f0a6c86a804ac64edfa69e110c1cfd42d00eed Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:19:56 -0400 Subject: [PATCH 184/233] changed prefix to ConditionalDetr --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 3 +- README_zh-hant.md | 3 +- docs/source/en/_toctree.yml | 4 +- docs/source/en/index.mdx | 1 + docs/source/en/model_doc/conditional_detr.mdx | 20 +- src/transformers/__init__.py | 24 +- .../models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 6 +- .../models/conditional_detr/__init__.py | 28 +-- .../configuration_conditional_detr.py | 14 +- ..._original_pytorch_checkpoint_to_pytorch.py | 14 +- .../feature_extraction_conditional_detr.py | 26 +-- .../modeling_conditional_detr.py | 212 +++++++++--------- .../utils/dummy_vision_objects.py | 2 +- ...est_feature_extraction_conditional_detr.py | 16 +- .../test_modeling_conditional_detr.py | 44 ++-- utils/check_repo.py | 4 +- 19 files changed, 215 insertions(+), 210 deletions(-) 
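
The rename announced in this commit only changes the Python class prefix; the following is a rough sketch of the import surface it is expected to produce, with the `ConditionalDetr*` names inferred from the commit title and the files listed above — the authoritative list is in the diffs that follow.

```python
# Hypothetical smoke test for the renamed classes; the `ConditionalDetr*` names are
# inferred from the commit title ("changed prefix to ConditionalDetr").
from transformers import (
    ConditionalDetrConfig,
    ConditionalDetrFeatureExtractor,
    ConditionalDetrForObjectDetection,
    ConditionalDetrForSegmentation,
    ConditionalDetrModel,
)

# Randomly initialized model from the default configuration, mirroring the
# config docstring example updated in the earlier patches.
config = ConditionalDetrConfig()
model = ConditionalDetrModel(config)
print(type(model.config).__name__)  # ConditionalDetrConfig
```
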
diff --git a/README.md b/README.md index ec8a0fd2e8b39..1ba4614902013 100644 --- a/README.md +++ b/README.md @@ -279,6 +279,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_ko.md b/README_ko.md index de1f2b3e13560..51ad1d1b18693 100644 --- a/README_ko.md +++ b/README_ko.md @@ -228,6 +228,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. diff --git a/README_zh-hans.md b/README_zh-hans.md index f0ffb27c4101f..b2bf8f6043e0d 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -252,7 +252,8 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang发布。 +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. 
**[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 03687087f7a22..366352ef5eb4e 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -264,7 +264,8 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. 
**[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index dfbdd503e6005..a4cd1005e3e83 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -57,7 +57,7 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides isExpanded: false title: Natural Language Processing - sections: @@ -503,4 +503,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API + title: API \ No newline at end of file diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 4d3aa7711503d..579c6172d6a6b 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,6 +69,7 @@ The documentation is organized into five sections: 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/model_doc/conditional_detr.mdx b/docs/source/en/model_doc/conditional_detr.mdx index 53129b77ea05a..d5846cbfee327 100644 --- a/docs/source/en/model_doc/conditional_detr.mdx +++ b/docs/source/en/model_doc/conditional_detr.mdx @@ -24,30 +24,30 @@ The abstract from the paper is the following: This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The original code can be found [here](https://github.com/Atten4Vis/ConditionalDETR). -## ConditionalDETRConfig +## ConditionalDetrConfig -[[autodoc]] ConditionalDETRConfig +[[autodoc]] ConditionalDetrConfig -## ConditionalDETRFeatureExtractor +## ConditionalDetrFeatureExtractor -[[autodoc]] ConditionalDETRFeatureExtractor +[[autodoc]] ConditionalDetrFeatureExtractor - __call__ - pad_and_create_pixel_mask - post_process - post_process_segmentation - post_process_panoptic -## ConditionalDETRModel +## ConditionalDetrModel -[[autodoc]] ConditionalDETRModel +[[autodoc]] ConditionalDetrModel - forward -## ConditionalDETRForObjectDetection +## ConditionalDetrForObjectDetection -[[autodoc]] ConditionalDETRForObjectDetection +[[autodoc]] ConditionalDetrForObjectDetection - forward -## ConditionalDETRForSegmentation +## ConditionalDetrForSegmentation -[[autodoc]] ConditionalDETRForSegmentation +[[autodoc]] ConditionalDetrForSegmentation - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b96724b992093..6abc53c85008e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -172,7 +172,7 @@ "CLIPVisionConfig", ], "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"], - "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDETRConfig"], + "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDetrConfig"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.cpm": [], @@ -661,7 +661,7 @@ _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") - _import_structure["models.conditional_detr"].append("ConditionalDETRFeatureExtractor") + _import_structure["models.conditional_detr"].append("ConditionalDetrFeatureExtractor") _import_structure["models.donut"].append("DonutFeatureExtractor") _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaProcessor"]) @@ -713,10 +713,10 @@ _import_structure["models.conditional_detr"].extend( [ "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", - "ConditionalDETRForObjectDetection", - "ConditionalDETRForSegmentation", - "ConditionalDETRModel", - "ConditionalDETRPreTrainedModel", + "ConditionalDetrForObjectDetection", + "ConditionalDetrForSegmentation", + 
"ConditionalDetrModel", + "ConditionalDetrPreTrainedModel", ] ) @@ -3086,7 +3086,7 @@ CLIPVisionConfig, ) from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer - from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDETRConfig + from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDetrConfig from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer @@ -3510,7 +3510,7 @@ from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor - from .models.conditional_detr import ConditionalDETRFeatureExtractor + from .models.conditional_detr import ConditionalDetrFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor @@ -3542,10 +3542,10 @@ else: from .models.conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, - ConditionalDETRModel, - ConditionalDETRPreTrainedModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ConditionalDetrPreTrainedModel, ) from .models.deformable_detr import ( DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index b28bb0665bc3d..258fb5d645bf1 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -43,7 +43,7 @@ ("canine", "CanineConfig"), ("clip", "CLIPConfig"), ("codegen", "CodeGenConfig"), - ("conditional_detr", "ConditionalDETRConfig"), + ("conditional_detr", "ConditionalDetrConfig"), ("convbert", "ConvBertConfig"), ("convnext", "ConvNextConfig"), ("ctrl", "CTRLConfig"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 322a48f1cefeb..8ee36125de25d 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -42,7 +42,7 @@ ("canine", "CanineModel"), ("clip", "CLIPModel"), ("codegen", "CodeGenModel"), - ("conditional_detr", "ConditionalDETRModel"), + ("conditional_detr", "ConditionalDetrModel"), ("convbert", "ConvBertModel"), ("convnext", "ConvNextModel"), ("ctrl", "CTRLModel"), @@ -374,7 +374,7 @@ [ # Do not add new models here, this class will be deprecated in the future. 
# Model for Image Segmentation mapping - ("conditional_detr", "ConditionalDETRForSegmentation"), + ("conditional_detr", "ConditionalDetrForSegmentation"), ("detr", "DetrForSegmentation"), ] ) @@ -457,7 +457,7 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Object Detection mapping - ("conditional_detr", "ConditionalDETRForObjectDetection"), + ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), diff --git a/src/transformers/models/conditional_detr/__init__.py b/src/transformers/models/conditional_detr/__init__.py index 1b72e7d7a3238..c2f1bdfdbbaae 100644 --- a/src/transformers/models/conditional_detr/__init__.py +++ b/src/transformers/models/conditional_detr/__init__.py @@ -24,8 +24,8 @@ _import_structure = { "configuration_conditional_detr": [ "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", - "ConditionalDETRConfig", - "ConditionalDETROnnxConfig", + "ConditionalDetrConfig", + "ConditionalDetrOnnxConfig", ] } @@ -35,7 +35,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDETRFeatureExtractor"] + _import_structure["feature_extraction_conditional_detr"] = ["ConditionalDetrFeatureExtractor"] try: if not is_timm_available(): @@ -45,18 +45,18 @@ else: _import_structure["modeling_conditional_detr"] = [ "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", - "ConditionalDETRForObjectDetection", - "ConditionalDETRForSegmentation", - "ConditionalDETRModel", - "ConditionalDETRPreTrainedModel", + "ConditionalDetrForObjectDetection", + "ConditionalDetrForSegmentation", + "ConditionalDetrModel", + "ConditionalDetrPreTrainedModel", ] if TYPE_CHECKING: from .configuration_conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, - ConditionalDETRConfig, - ConditionalDETROnnxConfig, + ConditionalDetrConfig, + ConditionalDetrOnnxConfig, ) try: @@ -65,7 +65,7 @@ except OptionalDependencyNotAvailable: pass else: - from .feature_extraction_conditional_detr import ConditionalDETRFeatureExtractor + from .feature_extraction_conditional_detr import ConditionalDetrFeatureExtractor try: if not is_timm_available(): @@ -75,10 +75,10 @@ else: from .modeling_conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, - ConditionalDETRModel, - ConditionalDETRPreTrainedModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ConditionalDetrPreTrainedModel, ) else: diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 4e0b4674356d9..6abcf85894735 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -33,9 +33,9 @@ } -class ConditionalDETRConfig(PretrainedConfig): +class ConditionalDetrConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ConditionalDETRModel`]. It is used to instantiate + This is the configuration class to store the configuration of a [`ConditionalDetrModel`]. It is used to instantiate a Conditional DETR model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the Conditional DETR [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) architecture. @@ -48,7 +48,7 @@ class ConditionalDETRConfig(PretrainedConfig): The number of input channels. num_queries (`int`, *optional*, defaults to 100): Number of object queries, i.e. detection slots. This is the maximal number of objects - [`ConditionalDETRModel`] can detect in a single image. For COCO, we recommend 100 queries. + [`ConditionalDetrModel`] can detect in a single image. For COCO, we recommend 100 queries. d_model (`int`, *optional*, defaults to 256): Dimension of the layers. encoder_layers (`int`, *optional*, defaults to 6): @@ -114,13 +114,13 @@ class ConditionalDETRConfig(PretrainedConfig): Examples: ```python - >>> from transformers import ConditionalDETRModel, ConditionalDETRConfig + >>> from transformers import ConditionalDetrModel, ConditionalDetrConfig >>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration - >>> configuration = ConditionalDETRConfig() + >>> configuration = ConditionalDetrConfig() >>> # Initializing a model from the microsoft/conditional-detr-resnet-50 style configuration - >>> model = ConditionalDETRModel(configuration) + >>> model = ConditionalDetrModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -214,7 +214,7 @@ def hidden_size(self) -> int: return self.d_model -class ConditionalDETROnnxConfig(OnnxConfig): +class ConditionalDetrOnnxConfig(OnnxConfig): torch_onnx_minimum_version = version.parse("1.11") diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index bcb2bbdda9a7d..904530c44c227 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -26,10 +26,10 @@ import requests from huggingface_hub import hf_hub_download from transformers import ( - ConditionalDETRConfig, - ConditionalDETRFeatureExtractor, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, + ConditionalDetrConfig, + ConditionalDetrFeatureExtractor, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, ) from transformers.utils import logging @@ -226,7 +226,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): """ # load default config - config = ConditionalDETRConfig() + config = ConditionalDetrConfig() # set backbone and dilation attributes if "resnet101" in model_name: config.backbone = "resnet101" @@ -246,7 +246,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): # load feature extractor format = "coco_panoptic" if is_panoptic else "coco_detection" - feature_extractor = ConditionalDETRFeatureExtractor(format=format) + feature_extractor = ConditionalDetrFeatureExtractor(format=format) # prepare image img = prepare_img() @@ -290,7 +290,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): val = state_dict.pop(key) state_dict[prefix + key] = val # finally, create HuggingFace model and load state dict - model = ConditionalDETRForSegmentation(config) if is_panoptic else 
ConditionalDETRForObjectDetection(config) + model = ConditionalDetrForSegmentation(config) if is_panoptic else ConditionalDetrForObjectDetection(config) model.load_state_dict(state_dict) model.eval() model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f37e3e1fb69bf..ec324f1f72456 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -119,7 +119,7 @@ def id_to_rgb(id_map): return color -class ConditionalDETRFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): +class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" Constructs a CONDITIONAL_DETR feature extractor. @@ -431,11 +431,11 @@ def __call__( annotations (`Dict`, `List[Dict]`, *optional*): The corresponding annotations in COCO format. - In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_detection"`, the + In case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_detection"`, the annotations for each image should have the following format: {'image_id': int, 'annotations': [annotation]}, with the annotations being a list of COCO object annotations. - In case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the + In case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int, 'file_name': str, 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic annotations. @@ -445,7 +445,7 @@ def __call__( masks_path (`pathlib.Path`, *optional*): Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only - relevant in case [`ConditionalDETRFeatureExtractor`] was initialized with `format = "coco_panoptic"`. + relevant in case [`ConditionalDetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`. pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`): Whether or not to pad images up to the largest image in a batch and create a pixel mask. @@ -676,11 +676,11 @@ def pad_and_create_pixel_mask( # inspired by https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py#L258 def post_process(self, outputs, target_sizes): """ - Converts the output of [`ConditionalDETRForObjectDetection`] into the format expected by the COCO api. Only + Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the COCO api. Only supports PyTorch. Args: - outputs ([`ConditionalDETRObjectDetectionOutput`]): + outputs ([`ConditionalDetrObjectDetectionOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original @@ -717,11 +717,11 @@ def post_process(self, outputs, target_sizes): def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): """ - Converts the output of [`ConditionalDETRForSegmentation`] into image segmentation predictions. Only supports + Converts the output of [`ConditionalDetrForSegmentation`] into image segmentation predictions. 
Only supports PyTorch. Parameters: - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. @@ -760,14 +760,14 @@ def to_tuple(tup): # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): """ - Converts the output of [`ConditionalDETRForSegmentation`] into actual instance segmentation predictions. Only + Converts the output of [`ConditionalDetrForSegmentation`] into actual instance segmentation predictions. Only supports PyTorch. Args: results (`List[Dict]`): - Results list obtained by [`~ConditionalDETRFeatureExtractor.post_process`], to which "masks" results + Results list obtained by [`~ConditionalDetrFeatureExtractor.post_process`], to which "masks" results will be added. - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original @@ -804,11 +804,11 @@ def post_process_instance(self, results, outputs, orig_target_sizes, max_target_ # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): """ - Converts the output of [`ConditionalDETRForSegmentation`] into actual panoptic predictions. Only supports + Converts the output of [`ConditionalDetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch. Parameters: - outputs ([`ConditionalDETRSegmentationOutput`]): + outputs ([`ConditionalDetrSegmentationOutput`]): Raw outputs of the model. processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 54dea544f3da9..05343a89f3ed0 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -39,7 +39,7 @@ replace_return_docstrings, requires_backends, ) -from .configuration_conditional_detr import ConditionalDETRConfig +from .configuration_conditional_detr import ConditionalDetrConfig if is_scipy_available(): @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = "ConditionalDETRConfig" +_CONFIG_FOR_DOC = "ConditionalDetrConfig" _CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -63,7 +63,7 @@ @dataclass -class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): +class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions): """ Base class for outputs of the Conditional DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. 
the output @@ -95,7 +95,7 @@ class ConditionalDETRDecoderOutput(BaseModelOutputWithCrossAttentions): @dataclass -class ConditionalDETRModelOutput(Seq2SeqModelOutput): +class ConditionalDetrModelOutput(Seq2SeqModelOutput): """ Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder @@ -137,10 +137,10 @@ class ConditionalDETRModelOutput(Seq2SeqModelOutput): @dataclass -# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDETR -class ConditionalDETRObjectDetectionOutput(ModelOutput): +# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDetr +class ConditionalDetrObjectDetectionOutput(ModelOutput): """ - Output type of [`ConditionalDETRForObjectDetection`]. + Output type of [`ConditionalDetrForObjectDetection`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): @@ -154,7 +154,7 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + possible padding). You can use [`~ConditionalDetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) @@ -201,10 +201,10 @@ class ConditionalDETRObjectDetectionOutput(ModelOutput): @dataclass -# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDETR -class ConditionalDETRSegmentationOutput(ModelOutput): +# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDetr +class ConditionalDetrSegmentationOutput(ModelOutput): """ - Output type of [`ConditionalDETRForSegmentation`]. + Output type of [`ConditionalDetrForSegmentation`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): @@ -218,12 +218,12 @@ class ConditionalDETRSegmentationOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~ConditionalDETRFeatureExtractor.post_process`] to retrieve the + possible padding). You can use [`~ConditionalDetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding boxes. pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): Segmentation masks logits for all queries. See also - [`~ConditionalDETRFeatureExtractor.post_process_segmentation`] or - [`~ConditionalDETRFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation + [`~ConditionalDetrFeatureExtractor.post_process_segmentation`] or + [`~ConditionalDetrFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks respectively. 
auxiliary_outputs (`list[Dict]`, *optional*): Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) @@ -272,8 +272,8 @@ class ConditionalDETRSegmentationOutput(ModelOutput): # BELOW: utilities copied from # https://github.com/facebookresearch/detr/blob/master/backbone.py -# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDETR -class ConditionalDETRFrozenBatchNorm2d(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDetr +class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ BatchNorm2d where the batch statistics and the affine parameters are fixed. @@ -282,7 +282,7 @@ class ConditionalDETRFrozenBatchNorm2d(nn.Module): """ def __init__(self, n): - super(ConditionalDETRFrozenBatchNorm2d, self).__init__() + super(ConditionalDetrFrozenBatchNorm2d, self).__init__() self.register_buffer("weight", torch.ones(n)) self.register_buffer("bias", torch.zeros(n)) self.register_buffer("running_mean", torch.zeros(n)) @@ -295,7 +295,7 @@ def _load_from_state_dict( if num_batches_tracked_key in state_dict: del state_dict[num_batches_tracked_key] - super(ConditionalDETRFrozenBatchNorm2d, self)._load_from_state_dict( + super(ConditionalDetrFrozenBatchNorm2d, self)._load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) @@ -312,12 +312,12 @@ def forward(self, x): return x * scale + bias -# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->ConditionalDetr def replace_batch_norm(m, name=""): for attr_str in dir(m): target_attr = getattr(m, attr_str) if isinstance(target_attr, nn.BatchNorm2d): - frozen = ConditionalDETRFrozenBatchNorm2d(target_attr.num_features) + frozen = ConditionalDetrFrozenBatchNorm2d(target_attr.num_features) bn = getattr(m, attr_str) frozen.weight.data.copy_(bn.weight) frozen.bias.data.copy_(bn.bias) @@ -328,11 +328,11 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) -class ConditionalDETRTimmConvEncoder(nn.Module): +class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. - nn.BatchNorm2d layers are replaced by ConditionalDETRFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. """ @@ -369,8 +369,8 @@ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): return out -# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDETR -class ConditionalDETRConvModel(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->ConditionalDetr +class ConditionalDetrConvModel(nn.Module): """ This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. """ @@ -391,7 +391,7 @@ def forward(self, pixel_values, pixel_mask): return out, pos -# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDetr def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
@@ -406,8 +406,8 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDETR -class ConditionalDETRSinePositionEmbedding(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->ConditionalDetr +class ConditionalDetrSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. @@ -444,8 +444,8 @@ def forward(self, pixel_values, pixel_mask): return pos -# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDETR -class ConditionalDETRLearnedPositionEmbedding(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->ConditionalDetr +class ConditionalDetrLearnedPositionEmbedding(nn.Module): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -468,14 +468,14 @@ def forward(self, pixel_values, pixel_mask=None): return pos -# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDETR +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->ConditionalDetr def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = ConditionalDETRSinePositionEmbedding(n_steps, normalize=True) + position_embedding = ConditionalDetrSinePositionEmbedding(n_steps, normalize=True) elif config.position_embedding_type == "learned": - position_embedding = ConditionalDETRLearnedPositionEmbedding(n_steps) + position_embedding = ConditionalDetrLearnedPositionEmbedding(n_steps) else: raise ValueError(f"Not supported {config.position_embedding_type}") @@ -634,7 +634,7 @@ def forward( return attn_output, attn_weights_reshaped -class ConditionalDETRAttention(nn.Module): +class ConditionalDetrAttention(nn.Module): """ Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper. 
@@ -750,8 +750,8 @@ def forward( return attn_output, attn_weights_reshaped -class ConditionalDETREncoderLayer(nn.Module): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrEncoderLayer(nn.Module): + def __init__(self, config: ConditionalDetrConfig): super().__init__() self.embed_dim = config.d_model self.self_attn = DetrAttention( @@ -819,8 +819,8 @@ def forward( return outputs -class ConditionalDETRDecoderLayer(nn.Module): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrDecoderLayer(nn.Module): + def __init__(self, config: ConditionalDetrConfig): super().__init__() self.embed_dim = config.d_model @@ -832,7 +832,7 @@ def __init__(self, config: ConditionalDETRConfig): self.sa_kpos_proj = nn.Linear(d_model, d_model) self.sa_v_proj = nn.Linear(d_model, d_model) - self.self_attn = ConditionalDETRAttention( + self.self_attn = ConditionalDetrAttention( embed_dim=self.embed_dim, out_dim=self.embed_dim, num_heads=config.decoder_attention_heads, @@ -852,7 +852,7 @@ def __init__(self, config: ConditionalDETRConfig): self.ca_v_proj = nn.Linear(d_model, d_model) self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) - self.encoder_attn = ConditionalDETRAttention( + self.encoder_attn = ConditionalDetrAttention( self.embed_dim * 2, self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -989,8 +989,8 @@ def forward( return outputs -# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDETR -class ConditionalDETRClassificationHead(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->ConditionalDetr +class ConditionalDetrClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): @@ -1023,9 +1023,9 @@ def forward(self, x): return x -# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDETR -class ConditionalDETRPreTrainedModel(PreTrainedModel): - config_class = ConditionalDETRConfig +# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDetr +class ConditionalDetrPreTrainedModel(PreTrainedModel): + config_class = ConditionalDetrConfig base_model_prefix = "model" main_input_name = "pixel_values" @@ -1033,12 +1033,12 @@ def _init_weights(self, module): std = self.config.init_std xavier_std = self.config.init_xavier_std - if isinstance(module, ConditionalDETRMHAttentionMap): + if isinstance(module, ConditionalDetrMHAttentionMap): nn.init.zeros_(module.k_linear.bias) nn.init.zeros_(module.q_linear.bias) nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) - elif isinstance(module, ConditionalDETRLearnedPositionEmbedding): + elif isinstance(module, ConditionalDetrLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): @@ -1053,7 +1053,7 @@ def _init_weights(self, module): module.weight.data[module.padding_idx].zero_() def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, ConditionalDETRDecoder): + if isinstance(module, ConditionalDetrDecoder): module.gradient_checkpointing = value @@ -1067,7 +1067,7 @@ def _set_gradient_checkpointing(self, module, 
value=False): and behavior. Parameters: - config ([`ConditionalDETRConfig`]): + config ([`ConditionalDetrConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. @@ -1078,8 +1078,8 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`ConditionalDETRFeatureExtractor`]. See - [`ConditionalDETRFeatureExtractor.__call__`] for details. + Pixel values can be obtained using [`ConditionalDetrFeatureExtractor`]. See + [`ConditionalDetrFeatureExtractor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: @@ -1112,11 +1112,11 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDETR -class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDetr +class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a - [`ConditionalDETREncoderLayer`]. + [`ConditionalDetrEncoderLayer`]. The encoder updates the flattened feature map through multiple self-attention layers. @@ -1125,16 +1125,16 @@ class ConditionalDETREncoder(ConditionalDETRPreTrainedModel): - position_embeddings are added to the forward pass. Args: - config: ConditionalDETRConfig + config: ConditionalDetrConfig """ - def __init__(self, config: ConditionalDETRConfig): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.encoder_layerdrop - self.layers = nn.ModuleList([ConditionalDETREncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default @@ -1222,9 +1222,9 @@ def forward( ) -class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): +class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel): """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDETRDecoderLayer`]. + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDetrDecoderLayer`]. The decoder updates the query embeddings through multiple self-attention and cross-attention layers. @@ -1234,15 +1234,15 @@ class ConditionalDETRDecoder(ConditionalDETRPreTrainedModel): - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. 
Args: - config: ConditionalDETRConfig + config: ConditionalDetrConfig """ - def __init__(self, config: ConditionalDETRConfig): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.decoder_layerdrop - self.layers = nn.ModuleList([ConditionalDETRDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output self.layernorm = nn.LayerNorm(config.d_model) d_model = config.d_model @@ -1425,7 +1425,7 @@ def custom_forward(*inputs): ] if v is not None ) - return ConditionalDETRDecoderOutput( + return ConditionalDetrDecoderOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attns, @@ -1442,22 +1442,22 @@ def custom_forward(*inputs): """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRModel(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrModel(ConditionalDetrPreTrainedModel): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # Create backbone + positional encoding - backbone = ConditionalDETRTimmConvEncoder(config.backbone, config.dilation) + backbone = ConditionalDetrTimmConvEncoder(config.backbone, config.dilation) position_embeddings = build_position_encoding(config) - self.backbone = ConditionalDETRConvModel(backbone, position_embeddings) + self.backbone = ConditionalDetrConvModel(backbone, position_embeddings) # Create projection layer self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) - self.encoder = ConditionalDETREncoder(config) - self.decoder = ConditionalDETRDecoder(config) + self.encoder = ConditionalDetrEncoder(config) + self.decoder = ConditionalDetrDecoder(config) # Initialize weights and apply final processing self.post_init() @@ -1477,7 +1477,7 @@ def unfreeze_backbone(self): param.requires_grad_(True) @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRModelOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1496,15 +1496,15 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRModel + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrModel >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state @@ -1582,7 +1582,7 @@ def forward( if not return_dict: return decoder_outputs + encoder_outputs - return ConditionalDETRModelOutput( + return 
ConditionalDetrModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, @@ -1602,18 +1602,18 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRForObjectDetection(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # CONDITIONAL_DETR encoder-decoder model - self.model = ConditionalDETRModel(config) + self.model = ConditionalDetrModel(config) # Object detection heads self.class_labels_classifier = nn.Linear( config.d_model, config.num_labels ) # We add one for the "no object" class - self.bbox_predictor = ConditionalDETRMLPPredictionHead( + self.bbox_predictor = ConditionalDetrMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) @@ -1629,7 +1629,7 @@ def _set_aux_loss(self, outputs_class, outputs_coord): return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1655,15 +1655,15 @@ def forward( Examples: ```python - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForObjectDetection + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForObjectDetection >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + >>> model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) @@ -1703,12 +1703,12 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: # First: create the matcher - matcher = ConditionalDETRHungarianMatcher( + matcher = ConditionalDetrHungarianMatcher( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion losses = ["labels", "boxes", "cardinality"] - criterion = ConditionalDETRLoss( + criterion = ConditionalDetrLoss( matcher=matcher, num_classes=self.config.num_labels, focal_alpha=self.config.focal_alpha, @@ -1751,7 +1751,7 @@ def forward( output = (logits, pred_boxes) + outputs return ((loss, loss_dict) + output) if loss is not None else output - return ConditionalDETRObjectDetectionOutput( + return ConditionalDetrObjectDetectionOutput( loss=loss, loss_dict=loss_dict, logits=logits, @@ -1775,22 +1775,22 @@ def forward( """, CONDITIONAL_DETR_START_DOCSTRING, ) -class ConditionalDETRForSegmentation(ConditionalDETRPreTrainedModel): - def __init__(self, config: ConditionalDETRConfig): +class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel): + def 
__init__(self, config: ConditionalDetrConfig): super().__init__(config) # object detection model - self.conditional_detr = ConditionalDETRForObjectDetection(config) + self.conditional_detr = ConditionalDetrForObjectDetection(config) # segmentation head hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads intermediate_channel_sizes = self.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes - self.mask_head = ConditionalDETRMaskHeadSmallConv( + self.mask_head = ConditionalDetrMaskHeadSmallConv( hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size ) - self.bbox_attention = ConditionalDETRMHAttentionMap( + self.bbox_attention = ConditionalDetrMHAttentionMap( hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std ) @@ -1798,7 +1798,7 @@ def __init__(self, config: ConditionalDETRConfig): self.post_init() @add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ConditionalDETRSegmentationOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ConditionalDetrSegmentationOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values, @@ -1832,16 +1832,16 @@ def forward( >>> import torch >>> import numpy - >>> from transformers import ConditionalDETRFeatureExtractor, ConditionalDETRForSegmentation + >>> from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForSegmentation >>> from transformers.models.conditional_detr.feature_extraction_conditional_detr import rgb_to_id >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained( + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained( ... "facebook/conditional_detr-resnet-50-panoptic" ... 
) - >>> model = ConditionalDETRForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") + >>> model = ConditionalDetrForSegmentation.from_pretrained("facebook/conditional_detr-resnet-50-panoptic") >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") @@ -1849,7 +1849,7 @@ def forward( >>> # forward pass >>> outputs = model(**inputs) - >>> # use the `post_process_panoptic` method of `ConditionalDETRFeatureExtractor` to convert to COCO format + >>> # use the `post_process_panoptic` method of `ConditionalDetrFeatureExtractor` to convert to COCO format >>> processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0) >>> result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0] @@ -1947,12 +1947,12 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: # First: create the matcher - matcher = ConditionalDETRHungarianMatcher( + matcher = ConditionalDetrHungarianMatcher( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion losses = ["labels", "boxes", "cardinality", "masks"] - criterion = ConditionalDETRLoss( + criterion = ConditionalDetrLoss( matcher=matcher, num_classes=self.config.num_labels, focal_alpha=self.config.focal_alpha, @@ -1991,7 +1991,7 @@ def forward( output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs return ((loss, loss_dict) + output) if loss is not None else output - return ConditionalDETRSegmentationOutput( + return ConditionalDetrSegmentationOutput( loss=loss, loss_dict=loss_dict, logits=logits, @@ -2013,8 +2013,8 @@ def _expand(tensor, length: int): # taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py -# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDETR -class ConditionalDETRMaskHeadSmallConv(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDetr +class ConditionalDetrMaskHeadSmallConv(nn.Module): """ Simple convolutional head, using group norm. Upsampling is done using a FPN approach """ @@ -2094,8 +2094,8 @@ def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): return x -# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDETR -class ConditionalDETRMHAttentionMap(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDetr +class ConditionalDetrMHAttentionMap(nn.Module): """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): @@ -2173,16 +2173,16 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py -class ConditionalDETRLoss(nn.Module): +class ConditionalDetrLoss(nn.Module): """ - This class computes the losses for ConditionalDETRForObjectDetection/ConditionalDETRForSegmentation. The process + This class computes the losses for ConditionalDetrForObjectDetection/ConditionalDetrForSegmentation. The process happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box). 
Args: - matcher (`ConditionalDETRHungarianMatcher`): + matcher (`ConditionalDetrHungarianMatcher`): Module able to compute a matching between targets and proposals. num_classes (`int`): Number of object categories, omitting the special no-object category. @@ -2375,7 +2375,7 @@ def forward(self, outputs, targets): # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -class ConditionalDETRMLPPredictionHead(nn.Module): +class ConditionalDetrMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, height and width of a bounding box w.r.t. an image. @@ -2397,7 +2397,7 @@ def forward(self, x): # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py -class ConditionalDETRHungarianMatcher(nn.Module): +class ConditionalDetrHungarianMatcher(nn.Module): """ This class computes an assignment between the targets and the predictions of the network. diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index f61b8dfdda282..d2ec5be33ceb8 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -24,7 +24,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class ConditionalDETRFeatureExtractor(metaclass=DummyObject): +class ConditionalDetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index b5e19fe005da7..2d21cafc68520 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -32,10 +32,10 @@ if is_vision_available(): from PIL import Image - from transformers import ConditionalDETRFeatureExtractor + from transformers import ConditionalDetrFeatureExtractor -class ConditionalDETRFeatureExtractionTester(unittest.TestCase): +class ConditionalDetrFeatureExtractionTester(unittest.TestCase): def __init__( self, parent, @@ -74,7 +74,7 @@ def prepare_feat_extract_dict(self): def get_expected_values(self, image_inputs, batched=False): """ - This function computes the expected height and width when providing images to ConditionalDETRFeatureExtractor, + This function computes the expected height and width when providing images to ConditionalDetrFeatureExtractor, assuming do_resize is set to True with a scalar size. 
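A minimal sketch of the shortest-side resizing rule that get_expected_values mirrors, assuming DETR-style behaviour with a scalar size and an optional max_size cap (illustrative, not the library code):

def expected_resized_size(height, width, size, max_size=None):
    # Scale so the shorter side becomes `size`, without letting the longer side exceed `max_size`.
    short, long = min(height, width), max(height, width)
    if max_size is not None and long * size / short > max_size:
        size = int(round(max_size * short / long))
    if height <= width:
        return size, int(size * width / height)
    return int(size * height / width), size

For instance, an image of height 480 and width 640 with size=800 and max_size=1333 comes out as 800 by 1066, which is the shape the tests below check against.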
""" if not batched: @@ -106,12 +106,12 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision -class ConditionalDETRFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): +class ConditionalDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): - feature_extraction_class = ConditionalDETRFeatureExtractor if is_vision_available() else None + feature_extraction_class = ConditionalDetrFeatureExtractor if is_vision_available() else None def setUp(self): - self.feature_extract_tester = ConditionalDETRFeatureExtractionTester(self) + self.feature_extract_tester = ConditionalDetrFeatureExtractionTester(self) @property def feat_extract_dict(self): @@ -253,7 +253,7 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} # encode them - feature_extractor = ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") # verify pixel values @@ -300,7 +300,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): # encode them # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic - feature_extractor = ConditionalDETRFeatureExtractor(format="coco_panoptic") + feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic") encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") # verify pixel values diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 38933f26c6627..fd008b9af494f 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -19,7 +19,7 @@ import math import unittest -from transformers import ConditionalDETRConfig, is_timm_available, is_vision_available +from transformers import ConditionalDetrConfig, is_timm_available, is_vision_available from transformers.testing_utils import require_timm, require_vision, slow, torch_device from transformers.utils import cached_property @@ -31,16 +31,16 @@ if is_timm_available(): import torch - from transformers import ConditionalDETRForObjectDetection, ConditionalDETRForSegmentation, ConditionalDETRModel + from transformers import ConditionalDetrForObjectDetection, ConditionalDetrForSegmentation, ConditionalDetrModel if is_vision_available(): from PIL import Image - from transformers import ConditionalDETRFeatureExtractor + from transformers import ConditionalDetrFeatureExtractor -class ConditionalDETRModelTester: +class ConditionalDetrModelTester: def __init__( self, parent, @@ -105,7 +105,7 @@ def prepare_config_and_inputs(self): return config, pixel_values, pixel_mask, labels def get_config(self): - return ConditionalDETRConfig( + return ConditionalDetrConfig( d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers, @@ -125,7 +125,7 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict def create_and_check_conditional_detr_model(self, config, pixel_values, pixel_mask, labels): - model = ConditionalDETRModel(config=config) + model = ConditionalDetrModel(config=config) model.to(torch_device) model.eval() @@ -137,7 +137,7 @@ def create_and_check_conditional_detr_model(self, config, 
pixel_values, pixel_ma ) def create_and_check_conditional_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): - model = ConditionalDETRForObjectDetection(config=config) + model = ConditionalDetrForObjectDetection(config=config) model.to(torch_device) model.eval() @@ -155,12 +155,12 @@ def create_and_check_conditional_detr_object_detection_head_model(self, config, @require_timm -class ConditionalDETRModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( ( - ConditionalDETRModel, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, + ConditionalDetrModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, ) if is_timm_available() else () @@ -176,7 +176,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class.__name__ in ["ConditionalDETRForObjectDetection", "ConditionalDETRForSegmentation"]: + if model_class.__name__ in ["ConditionalDetrForObjectDetection", "ConditionalDetrForSegmentation"]: labels = [] for i in range(self.model_tester.batch_size): target = {} @@ -199,8 +199,8 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict def setUp(self): - self.model_tester = ConditionalDETRModelTester(self) - self.config_tester = ConfigTester(self, config_class=ConditionalDETRConfig, has_text_modality=False) + self.model_tester = ConditionalDetrModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConditionalDetrConfig, has_text_modality=False) def test_config(self): self.config_tester.run_common_tests() @@ -279,10 +279,10 @@ def test_attention_outputs(self): if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning # Object Detection model returns pred_logits and pred_boxes - if model_class.__name__ == "ConditionalDETRForObjectDetection": + if model_class.__name__ == "ConditionalDetrForObjectDetection": correct_outlen += 1 # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks - if model_class.__name__ == "ConditionalDETRForSegmentation": + if model_class.__name__ == "ConditionalDetrForSegmentation": correct_outlen += 2 if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned @@ -406,7 +406,7 @@ def test_different_timm_backbone(self): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - if model_class.__name__ == "ConditionalDETRForObjectDetection": + if model_class.__name__ == "ConditionalDetrForObjectDetection": expected_shape = ( self.model_tester.batch_size, self.model_tester.num_queries, @@ -452,17 +452,17 @@ def prepare_img(): @require_timm @require_vision @slow -class ConditionalDETRModelIntegrationTests(unittest.TestCase): +class ConditionalDetrModelIntegrationTests(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - ConditionalDETRFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") if is_vision_available() else None ) def test_inference_no_head(self): - model = ConditionalDETRModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) 
feature_extractor = self.default_feature_extractor image = prepare_img() @@ -479,7 +479,7 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDETRForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +505,7 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDETRForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() diff --git a/utils/check_repo.py b/utils/check_repo.py index ffc47aa72af8f..8659f795602e2 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -60,8 +60,8 @@ "DetrEncoder", # Building part of bigger (tested) model. "DetrDecoder", # Building part of bigger (tested) model. "DetrDecoderWrapper", # Building part of bigger (tested) model. - "ConditionalDETREncoder", # Building part of bigger (tested) model. - "ConditionalDETRDecoder", # Building part of bigger (tested) model. + "ConditionalDetrEncoder", # Building part of bigger (tested) model. + "ConditionalDetrDecoder", # Building part of bigger (tested) model. "M2M100Encoder", # Building part of bigger (tested) model. "M2M100Decoder", # Building part of bigger (tested) model. "MCTCTEncoder", # Building part of bigger (tested) model. From 9b512e1e6e0c3978b0c7d6d6b367e6b9a194d74f Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:23:00 -0400 Subject: [PATCH 185/233] fixed docs --- README.md | 1 - README_zh-hans.md | 1 - README_zh-hant.md | 1 - docs/source/en/index.mdx | 3 +-- docs/source/en/serialization.mdx | 2 +- 5 files changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1ba4614902013..ec8a0fd2e8b39 100644 --- a/README.md +++ b/README.md @@ -279,7 +279,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. 
**[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/README_zh-hans.md b/README_zh-hans.md index b2bf8f6043e0d..88611a5f672bd 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -253,7 +253,6 @@ conda install -c huggingface transformers 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 366352ef5eb4e..b84d3ea8ca974 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -265,7 +265,6 @@ conda install -c huggingface transformers 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 579c6172d6a6b..40685a5c2fa8d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,7 +69,6 @@ The documentation is organized into five sections: 1. 
**[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. @@ -217,7 +216,7 @@ Flax), PyTorch, and/or TensorFlow. | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | -| conditional_detr | ❌ | ❌ | ✅ | ❌ | ❌ | +| Conditional DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index f6304004b62ae..a1577447f7235 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -57,7 +57,7 @@ Ready-made configurations include the following architectures: - CamemBERT - CLIP - CodeGen -- conditional_detr +- Conditional DETR - ConvBERT - ConvNeXT - Data2VecText From 9b6906a278e297d551a5e8b26a993ca651159a19 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Fri, 9 Sep 2022 21:50:18 -0400 Subject: [PATCH 186/233] Update README_ko.md --- README_ko.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README_ko.md b/README_ko.md index 51ad1d1b18693..8a2df2fcbc888 100644 --- a/README_ko.md +++ b/README_ko.md @@ -229,7 +229,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From d91f7ea7be60e68cee4313c5388d348c977d668f Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:20:11 -0400 Subject: [PATCH 187/233] added spatial_model_name --- utils/check_copies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check_copies.py b/utils/check_copies.py index 7d57173654468..0c8afa7b77ae8 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,6 +477,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", + "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. 
This only concerns modular models or those who do From 6a78fa1bdd8f4d439223bd881603d645e661fa22 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:24:51 -0400 Subject: [PATCH 188/233] fixed fix-copies --- src/transformers/models/auto/configuration_auto.py | 2 +- utils/check_copies.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 258fb5d645bf1..dce73cb390365 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -302,7 +302,7 @@ ("canine", "CANINE"), ("clip", "CLIP"), ("codegen", "CodeGen"), - ("conditional_detr", "conditional_detr"), + ("conditional_detr", "Conditional DETR"), ("convbert", "ConvBERT"), ("convnext", "ConvNeXT"), ("cpm", "CPM"), diff --git a/utils/check_copies.py b/utils/check_copies.py index 0c8afa7b77ae8..7d57173654468 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,7 +477,6 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", - "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. This only concerns modular models or those who do From fbd461460e75cc12a4e5ce36d9dc67ec3006ef0e Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:44:17 -0400 Subject: [PATCH 189/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index ec324f1f72456..f072310f7f878 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Feature extractor class for CONDITIONAL_DETR.""" +"""Feature extractor class for Conditional DETR.""" import io import pathlib From cf548b60bdc6fe619a2a00b4915201907114509c Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:44:58 -0400 Subject: [PATCH 190/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f072310f7f878..bb000a7996b3c 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -37,7 +37,7 @@ ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] -# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +# Copied from transformers.models.detr.feature_extraction_detr.center_to_corners_format def center_to_corners_format(x): """ Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format From a9e6cf8786390cfe82cf263b1bc050e483f21401 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:45:19 -0400 Subject: [PATCH 191/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index bb000a7996b3c..d89f3154e0bf0 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -48,6 +48,7 @@ def center_to_corners_format(x): return torch.stack(b, dim=-1) +# Copied from transformers.models.detr.feature_extraction_detr.corners_to_center_format def corners_to_center_format(x): """ Converts a NumPy array of bounding boxes of shape (number of bounding boxes, 4) of corners format (x_0, y_0, x_1, From 0ca32a9774f9e282d914f3523c3c3c4a1678017c Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:45:43 -0400 Subject: [PATCH 192/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index d89f3154e0bf0..d813ab2a45e3c 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -60,6 +60,7 @@ def corners_to_center_format(x): return np.stack(b, axis=-1) +# Copied from transformers.models.detr.feature_extraction_detr.masks_to_boxes def masks_to_boxes(masks): """ Compute the 
bounding boxes around the provided panoptic segmentation masks. From f898c9b6e940f3a2302a1df3fbf96187f6c097f5 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:46:05 -0400 Subject: [PATCH 193/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 05343a89f3ed0..3bf47082f6b30 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Attn4Vis and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 Microsoft Research Asia and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From e7c120f961dfc0b862a83f803aaa27605b2e8a03 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:46:25 -0400 Subject: [PATCH 194/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 3bf47082f6b30..7d45259b60adc 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -57,7 +57,7 @@ _CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Atten4Vis/ConditionalDETR", + "microsoft/conditional-detr-resnet-50", # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr ] From ec3e486cf8692414c858f53a9e1337a6b6fbc758 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:51:20 -0400 Subject: [PATCH 195/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 7d45259b60adc..bf4d82479375e 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -270,8 +270,6 @@ class ConditionalDetrSegmentationOutput(ModelOutput): encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None -# BELOW: utilities copied from -# https://github.com/facebookresearch/detr/blob/master/backbone.py # Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->ConditionalDetr class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ From e12b9243dea1c399055642036111ce9cfdf3e836 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:52:46 -0400 Subject: [PATCH 196/233] Update 
src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index bf4d82479375e..8550bba968802 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -326,6 +326,7 @@ def replace_batch_norm(m, name=""): replace_batch_norm(ch, n) +# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. From 24576f546098df8c3148af83195408e68d74ac0d Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:54:05 -0400 Subject: [PATCH 197/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 8550bba968802..415f0d47a41f5 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -503,6 +503,7 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) +# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. From aa415560582cebce0f917bcffe61056b24edfdb4 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:56:48 -0400 Subject: [PATCH 198/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 415f0d47a41f5..be07eca130e23 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1112,7 +1112,7 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL_DETR,Detr->ConditionalDetr +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL DETR,Detr->ConditionalDetr class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. 
Each layer is a From 3be87d54fc0bb3f3bc8ed4e23a6a5a226da3796d Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:04 -0400 Subject: [PATCH 199/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index be07eca130e23..00faac56fa1de 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1228,7 +1228,7 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel): The decoder updates the query embeddings through multiple self-attention and cross-attention layers. - Some small tweaks for CONDITIONAL_DETR: + Some small tweaks for Conditional DETR: - position_embeddings and query_position_embeddings are added to the forward pass. - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. From c60fe6d86a4a497cbb0271a3807633c92f737dfd Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:19 -0400 Subject: [PATCH 200/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 00faac56fa1de..849502de9323a 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1243,7 +1243,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layerdrop = config.decoder_layerdrop self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) - # in CONDITIONAL_DETR, the decoder uses layernorm after the last decoder layer output + # in Conditional DETR, the decoder uses layernorm after the last decoder layer output self.layernorm = nn.LayerNorm(config.d_model) d_model = config.d_model self.gradient_checkpointing = False From af40dce0e9c056946f07aaf936d90c627b7e92ef Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:57:42 -0400 Subject: [PATCH 201/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 849502de9323a..a063c1c2e46e4 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1437,7 +1437,7 @@ def custom_forward(*inputs): @add_start_docstrings( """ - The bare CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + The bare Conditional DETR Model (consisting of a backbone and encoder-decoder Transformer) 
outputting raw hidden-states without any specific head on top. """, CONDITIONAL_DETR_START_DOCSTRING, From e0e586ce84bd71a55d6a01bdaddbbc5d1a4be7bc Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:26:13 -0400 Subject: [PATCH 202/233] added some copied from --- .../feature_extraction_conditional_detr.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index d813ab2a45e3c..1421e82aa789a 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -94,9 +94,7 @@ def masks_to_boxes(masks): return np.stack([x_min, y_min, x_max, y_max], 1) -# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py -# Copyright (c) 2018, Alexander Kirillov -# All rights reserved. +# Copied from transformers.models.detr.feature_extraction_detr.rgb_to_id def rgb_to_id(color): if isinstance(color, np.ndarray) and len(color.shape) == 3: if color.dtype == np.uint8: @@ -104,7 +102,7 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) - +# Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): id_map_copy = id_map.copy() @@ -123,7 +121,7 @@ def id_to_rgb(id_map): class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" - Constructs a CONDITIONAL_DETR feature extractor. + Constructs a CONDITIONAL DETR feature extractor. This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. @@ -188,7 +186,7 @@ def prepare(self, image, target, return_segmentation_masks=False, masks_path=Non else: raise ValueError(f"Format {self.format} not supported") - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, segmentations, height, width): try: @@ -212,10 +210,10 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->Conditional DETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by CONDITIONAL_DETR. + Convert the target in COCO format into the format expected by CONDITIONAL DETR. 
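As context for prepare_coco_detection: the feature extractor takes standard COCO object-detection annotations and turns them into the tensors the model consumes. A hedged usage sketch, where the image path, label values and box coordinates are placeholders:

from PIL import Image
from transformers import ConditionalDetrFeatureExtractor

image = Image.open("cats.jpg")  # any local image
annotations = {
    "image_id": 39769,
    "annotations": [
        # COCO-style entry: bbox is absolute [x, y, width, height]
        {"bbox": [13.0, 22.0, 300.0, 400.0], "category_id": 17, "area": 120000.0, "iscrowd": 0},
    ],
}
feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50")
encoding = feature_extractor(images=image, annotations=annotations, return_tensors="pt")
# encoding["pixel_values"] holds the resized, normalized image;
# encoding["labels"][0] holds "class_labels" and "boxes" in normalized (center_x, center_y, width, height) form.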
""" w, h = image.size @@ -277,6 +275,7 @@ def prepare_coco_detection(self, image, target, return_segmentation_masks=False) return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_panoptic def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): w, h = image.size ann_info = target.copy() @@ -310,6 +309,7 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,6 +380,7 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. @@ -401,7 +402,8 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -549,7 +551,7 @@ def __call__( if annotations is not None: annotations = [annotations] - # prepare (COCO annotations as a list of Dict -> CONDITIONAL_DETR target as a single Dict per image) + # prepare (COCO annotations as a list of Dict -> CONDITIONAL DETR target as a single Dict per image) if annotations is not None: for idx, (image, target) in enumerate(zip(images, annotations)): if not isinstance(image, Image.Image): @@ -624,6 +626,7 @@ def __call__( return encoded_inputs + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] @@ -632,6 +635,7 @@ def _max_by_axis(self, the_list): maxes[index] = max(maxes[index], item) return maxes + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.pad_and_create_pixel_mask def pad_and_create_pixel_mask( self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None ): @@ -717,6 +721,7 @@ def post_process(self, outputs, target_sizes): return results + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_segmentation with Detr->ConditionalDetr def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): """ Converts the output of [`ConditionalDetrForSegmentation`] into image segmentation predictions. Only supports @@ -759,7 +764,7 @@ def to_tuple(tup): preds.append(predictions) return preds - # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_instance with Detr->ConditionalDetr def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): """ Converts the output of [`ConditionalDetrForSegmentation`] into actual instance segmentation predictions. 
Only @@ -803,7 +808,7 @@ def post_process_instance(self, results, outputs, orig_target_sizes, max_target_ return results - # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_panoptic with Detr->ConditionalDetr def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): """ Converts the output of [`ConditionalDetrForSegmentation`] into actual panoptic predictions. Only supports From 9c79f40b777d55da3064c657a7ef55741237d1bb Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:27:39 -0400 Subject: [PATCH 203/233] added some copied from --- .../conditional_detr/feature_extraction_conditional_detr.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 1421e82aa789a..812a0fe103cea 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -309,7 +309,6 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,7 +379,6 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. 
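In plain terms, this normalization step standardizes the image channel-wise and, when a target is present, rescales corner-format boxes into relative (center_x, center_y, width, height) coordinates; a small illustrative sketch, not the library implementation:

import numpy as np

def normalize_image_and_boxes(image, boxes, mean, std):
    # image: float array of shape (channels, height, width)
    # boxes: array of shape (num_boxes, 4) in absolute (x0, y0, x1, y1) corners format
    image = (image - np.array(mean)[:, None, None]) / np.array(std)[:, None, None]
    _, height, width = image.shape
    x0, y0, x1, y1 = boxes.T
    centers = np.stack([(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0], axis=1)
    return image, centers / np.array([width, height, width, height])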
@@ -403,7 +401,6 @@ def _normalize(self, image, mean, std, target=None): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -626,7 +623,6 @@ def __call__( return encoded_inputs - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] From 0793156c9a9191f1dbf6e98eac48ebb3025ef57b Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:40:45 -0400 Subject: [PATCH 204/233] added some copied from --- .../feature_extraction_conditional_detr.py | 3 +- .../modeling_conditional_detr.py | 52 +++++++++++-------- ...est_feature_extraction_conditional_detr.py | 4 +- .../test_modeling_conditional_detr.py | 8 +-- 4 files changed, 36 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 812a0fe103cea..c9f5d8124e539 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -210,10 +210,9 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->Conditional DETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by CONDITIONAL DETR. + Convert the target in COCO format into the format expected by Conditional DETR. """ w, h = image.size diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a063c1c2e46e4..435a80b220b3b 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -54,11 +54,11 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "ConditionalDetrConfig" -_CHECKPOINT_FOR_DOC = "Atten4Vis/ConditionalDETR" +_CHECKPOINT_FOR_DOC = "microsoft/conditional-detr-resnet-50" CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ "microsoft/conditional-detr-resnet-50", - # See all conditional_detr models at https://huggingface.co/models?filter=conditional_detr + # See all Conditional DETR models at https://huggingface.co/models?filter=conditional_detr ] @@ -331,11 +331,11 @@ class ConditionalDetrTimmConvEncoder(nn.Module): """ Convolutional encoder (backbone) from the timm library. - nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. 
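The frozen batch norm referred to here keeps its statistics and affine parameters as fixed buffers, so fine-tuning never updates them; this is typically done because detection fine-tuning uses small batches, which would make live batch statistics noisy. Roughly along these lines (a sketch, not the exact class in the patch):

import torch
from torch import nn

class FrozenBatchNorm2d(nn.Module):
    def __init__(self, num_features, epsilon=1e-5):
        super().__init__()
        # Buffers, not parameters: they receive the pretrained statistics but get no gradients.
        self.register_buffer("weight", torch.ones(num_features))
        self.register_buffer("bias", torch.zeros(num_features))
        self.register_buffer("running_mean", torch.zeros(num_features))
        self.register_buffer("running_var", torch.ones(num_features))
        self.epsilon = epsilon

    def forward(self, x):
        scale = self.weight * (self.running_var + self.epsilon).rsqrt()
        shift = self.bias - self.running_mean * scale
        return x * scale.reshape(1, -1, 1, 1) + shift.reshape(1, -1, 1, 1)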
""" - def __init__(self, name: str, dilation: bool): + def __init__(self, name: str, dilation: bool, use_pretrained_backbone: bool, num_channels: int = 3): super().__init__() kwargs = {} @@ -344,7 +344,14 @@ def __init__(self, name: str, dilation: bool): requires_backends(self, ["timm"]) - backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs) + backbone = create_model( + name, + pretrained=use_pretrained_backbone, + features_only=True, + out_indices=(1, 2, 3, 4), + in_chans=num_channels, + **kwargs, + ) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) @@ -482,7 +489,7 @@ def build_position_encoding(config): # function to generate sine positional embedding for 2d coordinates -def gen_sineembed_for_position(pos_tensor): +def gen_sine_position_embeddings(pos_tensor): scale = 2 * math.pi dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) dim_t = 10000 ** (2 * (dim_t // 2) / 128) @@ -503,12 +510,11 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) -# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the CONDITIONAL_DETR paper). + Here, we add position embeddings to the queries and keys (as explained in the Conditional DETR paper). """ def __init__( @@ -1112,7 +1118,6 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->CONDITIONAL DETR,Detr->ConditionalDetr class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -1120,7 +1125,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): The encoder updates the flattened feature map through multiple self-attention layers. - Small tweak for CONDITIONAL_DETR: + Small tweak for Conditional DETR: - position_embeddings are added to the forward pass. 
@@ -1136,7 +1141,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) - # in the original CONDITIONAL_DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + # in the original Conditional DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default # Initialize weights and apply final processing self.post_init() @@ -1341,7 +1346,7 @@ def forward( reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1) obj_center = reference_points[..., :2].transpose(0, 1) # get sine embedding for the query vector - query_sine_embed_before_transformation = gen_sineembed_for_position(obj_center) + query_sine_embed_before_transformation = gen_sine_position_embeddings(obj_center) for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1503,8 +1508,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") + >>> model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state @@ -1606,7 +1611,7 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): def __init__(self, config: ConditionalDetrConfig): super().__init__(config) - # CONDITIONAL_DETR encoder-decoder model + # CONDITIONAL DETR encoder-decoder model self.model = ConditionalDetrModel(config) # Object detection heads @@ -1662,8 +1667,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") - >>> model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR") + >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") + >>> model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) @@ -2374,7 +2379,7 @@ def forward(self, outputs, targets): return losses -# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDetr class ConditionalDetrMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, @@ -2396,7 +2401,6 @@ def forward(self, x): return x -# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/matcher.py class ConditionalDetrHungarianMatcher(nn.Module): """ This class computes an assignment between the targets and the predictions of the network. 
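The assignment itself is the classic Hungarian (linear sum assignment) problem over a query-by-target cost matrix; a rough per-image sketch, assuming the class, L1 box and GIoU costs have already been combined into cost_matrix:

import torch
from scipy.optimize import linear_sum_assignment

def hungarian_match(cost_matrix):
    # cost_matrix: tensor of shape (num_queries, num_targets)
    query_indices, target_indices = linear_sum_assignment(cost_matrix.cpu().numpy())
    return (
        torch.as_tensor(query_indices, dtype=torch.int64),
        torch.as_tensor(target_indices, dtype=torch.int64),
    )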
@@ -2421,7 +2425,7 @@ def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float self.class_cost = class_cost self.bbox_cost = bbox_cost self.giou_cost = giou_cost - if class_cost == 0 or bbox_cost == 0 or giou_cost == 0: + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: raise ValueError("All costs of the Matcher can't be 0") @torch.no_grad() @@ -2488,6 +2492,7 @@ def _upcast(t: Tensor) -> Tensor: return t if t.dtype in (torch.int32, torch.int64) else t.int() +# Copied from transformers.models.detr.modeling_detr.box_area def box_area(boxes: Tensor) -> Tensor: """ Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. @@ -2504,7 +2509,7 @@ def box_area(boxes: Tensor) -> Tensor: return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) -# modified from torchvision to also return the union +# Copied from transformers.models.detr.modeling_detr.box_iou def box_iou(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) @@ -2521,6 +2526,7 @@ def box_iou(boxes1, boxes2): return iou, union +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou def generalized_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. @@ -2554,7 +2560,7 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes - +# Copied from transformers.models.detr.modeling_detr.NestedTensor class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors @@ -2575,7 +2581,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) - +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: max_size = _max_by_axis([list(img.shape) for img in tensor_list]) diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index 2d21cafc68520..371d97de0000b 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -253,7 +253,7 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} # encode them - feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") # verify pixel values @@ -299,7 +299,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") # encode them - # TODO replace by .from_pretrained Atten4Vis/ConditionalDETR-panoptic + # TODO replace by .from_pretrained microsoft/conditional-detr-resnet-50-panoptic feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic") encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index fd008b9af494f..2fb4bb562d5bf 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ 
b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -456,13 +456,13 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - ConditionalDetrFeatureExtractor.from_pretrained("Atten4Vis/ConditionalDETR") + ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") if is_vision_available() else None ) def test_inference_no_head(self): - model = ConditionalDetrModel.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -479,7 +479,7 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDetrForObjectDetection.from_pretrained("Atten4Vis/ConditionalDETR").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +505,7 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDetrForSegmentation.from_pretrained("Atten4Vis/ConditionalDETR-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() From 0c84ed88deeab34af3e6f5ce76d7a8e5de918204 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:43:10 -0400 Subject: [PATCH 205/233] added some copied from --- .../feature_extraction_conditional_detr.py | 3 ++- .../models/conditional_detr/modeling_conditional_detr.py | 2 ++ .../conditional_detr/test_modeling_conditional_detr.py | 8 ++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index c9f5d8124e539..922cfaf09e98e 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -102,6 +102,7 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + # Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): @@ -399,7 +400,7 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + def __call__( self, images: ImageInput, diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 435a80b220b3b..a24a306b65e9f 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2560,6 +2560,7 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes + # Copied from transformers.models.detr.modeling_detr.NestedTensor class 
NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): @@ -2581,6 +2582,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) + # Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 2fb4bb562d5bf..a67f0bc30638e 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -479,7 +479,9 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) def test_inference_object_detection_head(self): - model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) + model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to( + torch_device + ) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -505,7 +507,9 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) def test_inference_panoptic_segmentation_head(self): - model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to(torch_device) + model = ConditionalDetrForSegmentation.from_pretrained("microsoft/conditional-detr-resnet-50-panoptic").to( + torch_device + ) feature_extractor = self.default_feature_extractor image = prepare_img() From ade3e4c06c80a42dde7ff9f60059b9ce0d0b82a3 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 14:52:49 -0400 Subject: [PATCH 206/233] fixed use_pretrained issue --- .../models/conditional_detr/configuration_conditional_detr.py | 4 ++++ .../models/conditional_detr/modeling_conditional_detr.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 6abcf85894735..df6673e9ebfef 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -134,6 +134,7 @@ class ConditionalDetrConfig(PretrainedConfig): def __init__( self, + num_channels=3, num_queries=300, max_position_embeddings=1024, encoder_layers=6, @@ -157,6 +158,7 @@ def __init__( auxiliary_loss=False, position_embedding_type="sine", backbone="resnet50", + use_pretrained_backbone=True, dilation=False, class_cost=2, bbox_cost=5, @@ -169,6 +171,7 @@ def __init__( focal_alpha=0.25, **kwargs ): + self.num_channels = num_channels self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -191,6 +194,7 @@ def __init__( self.auxiliary_loss = auxiliary_loss self.position_embedding_type = position_embedding_type self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.dilation = dilation # Hungarian matcher self.class_cost = class_cost diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a24a306b65e9f..9e70e1b873cd6 100644 --- 
a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1452,7 +1452,9 @@ def __init__(self, config: ConditionalDetrConfig): super().__init__(config) # Create backbone + positional encoding - backbone = ConditionalDetrTimmConvEncoder(config.backbone, config.dilation) + backbone = ConditionalDetrTimmConvEncoder( + config.backbone, config.dilation, config.use_pretrained_backbone, config.num_channels + ) position_embeddings = build_position_encoding(config) self.backbone = ConditionalDetrConvModel(backbone, position_embeddings) From 1ba0ff0c1732bd8f4345f44f685ffe93671a21cf Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 16 Sep 2022 10:35:52 -0400 Subject: [PATCH 207/233] changed post-process --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 922cfaf09e98e..be85334d7b354 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -701,7 +701,7 @@ def post_process(self, outputs, target_sizes): raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1) scores = topk_values topk_boxes = topk_indexes // out_logits.shape[2] labels = topk_indexes % out_logits.shape[2] From af4067195922bda656f743fb81406197f618b915 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:47:29 -0400 Subject: [PATCH 208/233] added conditional_detr files --- src/transformers/__init__.py | 7 +++++++ src/transformers/models/auto/feature_extraction_auto.py | 2 ++ src/transformers/models/auto/modeling_auto.py | 1 + .../models/conditional_detr/modeling_conditional_detr.py | 2 -- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6abc53c85008e..639d33c4754be 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3553,6 +3553,13 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) + from .models.conditional_detr import ( + CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + ConditionalDETRForObjectDetection, + ConditionalDETRForSegmentation, + ConditionalDETRModel, + ConditionalDETRPreTrainedModel, + ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, DetrForObjectDetection, diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 015fd132ef0dc..9ba8295b406da 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,8 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), + ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py 
index 8ee36125de25d..ca4d2d8876f01 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -459,6 +459,7 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), + ("conditional_detr", "ConditionalDETRForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 9e70e1b873cd6..ba958b6af9040 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -502,7 +502,6 @@ def gen_sine_position_embeddings(pos_tensor): pos = torch.cat((pos_y, pos_x), dim=2) return pos - def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) @@ -679,7 +678,6 @@ def __init__( def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() From ad01fc1267550d6fc9e04f3ef929efb176b91273 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 00:54:36 -0400 Subject: [PATCH 209/233] checked copies --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index ba958b6af9040..dd501e8fe4a1c 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1695,7 +1695,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] From 888153523178925d7fdcb373f85d05da0b38cd9d Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 01:22:01 -0400 Subject: [PATCH 210/233] fixed style and copies --- docs/source/en/_toctree.yml | 6 +++--- .../models/conditional_detr/modeling_conditional_detr.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a4cd1005e3e83..d158be23cea58 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,7 +42,8 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - sections: + - isExpanded: false + sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +59,6 @@ - local: tasks/multiple_choice title: Multiple choice title: Task guides - isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification @@ -503,4 +503,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API \ No newline at end of file + title: API diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py 
b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index dd501e8fe4a1c..9e70e1b873cd6 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -502,6 +502,7 @@ def gen_sine_position_embeddings(pos_tensor): pos = torch.cat((pos_y, pos_x), dim=2) return pos + def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) @@ -678,6 +679,7 @@ def __init__( def _qk_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _v_shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous() @@ -1695,7 +1697,7 @@ def forward( # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - + reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) outputs_coords = [] From b78443b24f2bee5232ba7e4b7c54dc535573d7b3 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 10:48:44 -0400 Subject: [PATCH 211/233] fixed some doc issue --- docs/source/en/_toctree.yml | 6 +++--- src/transformers/models/auto/feature_extraction_auto.py | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d158be23cea58..dfbdd503e6005 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -42,8 +42,7 @@ title: Use tokenizers from 🤗 Tokenizers - local: multilingual title: Inference for multilingual models - - isExpanded: false - sections: + - sections: - local: tasks/sequence_classification title: Text classification - local: tasks/token_classification @@ -58,7 +57,8 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides + isExpanded: false title: Natural Language Processing - sections: - local: tasks/audio_classification diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 9ba8295b406da..015fd132ef0dc 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,8 +39,6 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), - ("conditional_detr", "ConditionalDETRFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), From 0963e5e20fb7304b3ba2a649f9bee1d8d4b0b555 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:19:56 -0400 Subject: [PATCH 212/233] changed prefix to ConditionalDetr --- README.md | 1 + docs/source/en/_toctree.yml | 4 ++-- docs/source/en/index.mdx | 1 + src/transformers/__init__.py | 8 ++++---- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ec8a0fd2e8b39..1ba4614902013 100644 --- a/README.md +++ b/README.md @@ -279,6 +279,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index dfbdd503e6005..a4cd1005e3e83 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -57,7 +57,7 @@ title: Summarization - local: tasks/multiple_choice title: Multiple choice - title: Task guides + title: Task guides isExpanded: false title: Natural Language Processing - sections: @@ -503,4 +503,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API + title: API \ No newline at end of file diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 40685a5c2fa8d..a705cb447248c 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,6 +69,7 @@ The documentation is organized into five sections: 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 639d33c4754be..d2efd10af9141 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3555,10 +3555,10 @@ ) from .models.conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, - ConditionalDETRForObjectDetection, - ConditionalDETRForSegmentation, - ConditionalDETRModel, - ConditionalDETRPreTrainedModel, + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ConditionalDetrPreTrainedModel, ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, From 449ffc3cf1d52d434d3ad0251c8c42acffe5a943 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Fri, 9 Sep 2022 11:23:00 -0400 Subject: [PATCH 213/233] fixed docs --- README.md | 1 - docs/source/en/index.mdx | 1 - 2 files changed, 2 deletions(-) diff --git a/README.md b/README.md index 1ba4614902013..ec8a0fd2e8b39 100644 --- a/README.md +++ b/README.md @@ -279,7 +279,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. 
**[Conditional DETR](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](https://huggingface.co/docs/transformers/main/model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index a705cb447248c..40685a5c2fa8d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -69,7 +69,6 @@ The documentation is organized into five sections: 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. -1. **[conditional_detr](model_doc/conditional_detr)** (from ) released with the paper []() by . 1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. 
**[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. From 8629564439e208818dfef4b3b203af261c2b5ea3 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:20:11 -0400 Subject: [PATCH 214/233] added spatial_model_name --- utils/check_copies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check_copies.py b/utils/check_copies.py index 7d57173654468..0c8afa7b77ae8 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,6 +477,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", + "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. This only concerns modular models or those who do From b57116eb2966cd86ae3884b36b2ed7971e431e3c Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 12 Sep 2022 21:24:51 -0400 Subject: [PATCH 215/233] fixed fix-copies --- utils/check_copies.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/check_copies.py b/utils/check_copies.py index 0c8afa7b77ae8..7d57173654468 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -477,7 +477,6 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "OpenAI GPT": "GPT", "Perceiver": "Perceiver IO", "ViT": "Vision Transformer (ViT)", - "conditional_detr": "Conditional DETR" } # Update this list with the models that shouldn't be in the README. This only concerns modular models or those who do From 7c7f38e572ad6c0c7ca6bc4d567b54d5eaa1f2a9 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 13 Sep 2022 09:54:05 -0400 Subject: [PATCH 216/233] Update src/transformers/models/conditional_detr/modeling_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 9e70e1b873cd6..747f514737268 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -510,6 +510,7 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) +# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. 
From 8c1992dc53370c94b6d585d05aadf7272808fb59 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:26:13 -0400 Subject: [PATCH 217/233] added some copied from --- .../feature_extraction_conditional_detr.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index be85334d7b354..4405a8d81b4b4 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -102,7 +102,6 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) - # Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): @@ -309,6 +308,7 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -379,6 +379,7 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. @@ -400,7 +401,8 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -623,6 +625,7 @@ def __call__( return encoded_inputs + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] From 5614267070f00557a9a5edb41405bed46327e18a Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 10:27:39 -0400 Subject: [PATCH 218/233] added some copied from --- .../conditional_detr/feature_extraction_conditional_detr.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 4405a8d81b4b4..9d09148c6b1fd 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -308,7 +308,6 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. 
If size is an int, smaller @@ -379,7 +378,6 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. @@ -402,7 +400,6 @@ def _normalize(self, image, mean, std, target=None): return image, target - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._call with DETR->Conditional DETR,Detr->ConditionalDetr def __call__( self, images: ImageInput, @@ -625,7 +622,6 @@ def __call__( return encoded_inputs - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] From 6dc221662230b4c5f6adcbc4d23bfd444f5f707d Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:40:45 -0400 Subject: [PATCH 219/233] added some copied from --- .../models/conditional_detr/modeling_conditional_detr.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 747f514737268..9ea9490c9befc 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -510,7 +510,6 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) -# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->Conditional DETR class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. @@ -2563,7 +2562,6 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes - # Copied from transformers.models.detr.modeling_detr.NestedTensor class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): @@ -2585,7 +2583,6 @@ def decompose(self): def __repr__(self): return str(self.tensors) - # Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: From f9c65fcaad65c773d02c5e5c533c316eafb93bfb Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 13 Sep 2022 11:43:10 -0400 Subject: [PATCH 220/233] added some copied from --- .../conditional_detr/feature_extraction_conditional_detr.py | 3 ++- .../models/conditional_detr/modeling_conditional_detr.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 9d09148c6b1fd..be85334d7b354 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -102,6 +102,7 @@ def rgb_to_id(color): return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + # Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb def id_to_rgb(id_map): if isinstance(id_map, np.ndarray): @@ -399,7 +400,7 @@ def _normalize(self, image, mean, std, target=None): target["boxes"] = boxes return image, target - + def __call__( self, images: ImageInput, diff --git 
a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 9ea9490c9befc..9e70e1b873cd6 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2562,6 +2562,7 @@ def _max_by_axis(the_list): maxes[index] = max(maxes[index], item) return maxes + # Copied from transformers.models.detr.modeling_detr.NestedTensor class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): @@ -2583,6 +2584,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) + # Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: From 40d9e033c6d829a82ec4f2298d9d027a7c15481d Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 19 Sep 2022 11:05:44 -0400 Subject: [PATCH 221/233] fix style quality and copies --- src/transformers/__init__.py | 6 +++ src/transformers/models/auto/modeling_auto.py | 1 + .../modeling_conditional_detr.py | 44 ++++++++++--------- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d2efd10af9141..344e842114ea7 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3560,6 +3560,12 @@ ConditionalDetrModel, ConditionalDetrPreTrainedModel, ) + from .models.deformable_detr import ( + DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, + DeformableDetrForObjectDetection, + DeformableDetrModel, + DeformableDetrPreTrainedModel, + ) from .models.detr import ( DETR_PRETRAINED_MODEL_ARCHIVE_LIST, DetrForObjectDetection, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index ca4d2d8876f01..2ae4542d58bf6 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -460,6 +460,7 @@ ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), ("conditional_detr", "ConditionalDETRForObjectDetection"), + ("deformable_detr", "DeformableDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 9e70e1b873cd6..a8d272f312cd2 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -280,7 +280,7 @@ class ConditionalDetrFrozenBatchNorm2d(nn.Module): """ def __init__(self, n): - super(ConditionalDetrFrozenBatchNorm2d, self).__init__() + super().__init__() self.register_buffer("weight", torch.ones(n)) self.register_buffer("bias", torch.zeros(n)) self.register_buffer("running_mean", torch.zeros(n)) @@ -293,7 +293,7 @@ def _load_from_state_dict( if num_batches_tracked_key in state_dict: del state_dict[num_batches_tracked_key] - super(ConditionalDetrFrozenBatchNorm2d, self)._load_from_state_dict( + super()._load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) @@ -398,14 +398,14 @@ def forward(self, pixel_values, pixel_mask): # Copied from transformers.models.detr.modeling_detr._expand_mask with Detr->ConditionalDetr -def _expand_mask(mask: torch.Tensor, dtype: 
torch.dtype, tgt_len: Optional[int] = None): +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len + batch_size, source_len = mask.size() + target_len = target_len if target_len is not None else source_len - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) inverted_mask = 1.0 - expanded_mask @@ -462,12 +462,12 @@ def __init__(self, embedding_dim=256): self.column_embeddings = nn.Embedding(50, embedding_dim) def forward(self, pixel_values, pixel_mask=None): - h, w = pixel_values.shape[-2:] - i = torch.arange(w, device=pixel_values.device) - j = torch.arange(h, device=pixel_values.device) - x_emb = self.column_embeddings(i) - y_emb = self.row_embeddings(j) - pos = torch.cat([x_emb.unsqueeze(0).repeat(h, 1, 1), y_emb.unsqueeze(1).repeat(1, w, 1)], dim=-1) + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) pos = pos.permute(2, 0, 1) pos = pos.unsqueeze(0) pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) @@ -2538,15 +2538,17 @@ def generalized_box_iou(boxes1, boxes2): """ # degenerate boxes gives inf / nan results # so do an early check - assert (boxes1[:, 2:] >= boxes1[:, :2]).all() - assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") iou, union = box_iou(boxes1, boxes2) - lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - wh = (rb - lt).clamp(min=0) # [N,M,2] - area = wh[:, :, 0] * wh[:, :, 1] + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] return iou - (area - union) / area @@ -2590,11 +2592,11 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): if tensor_list[0].ndim == 3: max_size = _max_by_axis([list(img.shape) for img in tensor_list]) batch_shape = [len(tensor_list)] + max_size - b, c, h, w = batch_shape + batch_size, num_channels, height, width = batch_shape dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) for img, pad_img, m in zip(tensor_list, tensor, mask): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) m[: img.shape[1], : img.shape[2]] = False From b5d5dbb02c85fe1603a346b944cb9d0a715a7648 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 19 Sep 2022 
14:15:53 -0400 Subject: [PATCH 222/233] fix style quality and copies --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index a8d272f312cd2..91c1a71d6aa82 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1330,7 +1330,7 @@ def forward( # expand encoder attention mask if encoder_hidden_states is not None and encoder_attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, target_len=input_shape[-1]) # optional intermediate hidden states intermediate = () if self.config.auxiliary_loss else None From d5655f69239a0fe6dee1b5ec6d591185968ef6f8 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Mon, 19 Sep 2022 14:22:24 -0400 Subject: [PATCH 223/233] fix style quality and copies --- .../models/conditional_detr/modeling_conditional_detr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 91c1a71d6aa82..20aa578737d70 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1330,7 +1330,9 @@ def forward( # expand encoder attention mask if encoder_hidden_states is not None and encoder_attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, target_len=input_shape[-1]) + encoder_attention_mask = _expand_mask( + encoder_attention_mask, inputs_embeds.dtype, target_len=input_shape[-1] + ) # optional intermediate hidden states intermediate = () if self.config.auxiliary_loss else None From 6bdf41673c3c354980920de553209ac162444a4f Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 10:18:01 -0400 Subject: [PATCH 224/233] add more fix-copies --- .../conditional_detr/feature_extraction_conditional_detr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index be85334d7b354..dedaa5ccda3df 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -211,9 +211,10 @@ def convert_coco_poly_to_mask(self, segmentations, height, width): return masks + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection with DETR->ConditionalDETR def prepare_coco_detection(self, image, target, return_segmentation_masks=False): """ - Convert the target in COCO format into the format expected by Conditional DETR. + Convert the target in COCO format into the format expected by ConditionalDETR. 
""" w, h = image.size From 1af93974669696951c6a236a023773a69d6ec08d Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 11:36:53 -0400 Subject: [PATCH 225/233] fixed some variable names & added more fix-copies --- src/transformers/models/auto/modeling_auto.py | 1 - .../feature_extraction_conditional_detr.py | 10 +++++- .../modeling_conditional_detr.py | 26 +++++++------- ...est_feature_extraction_conditional_detr.py | 36 +++++++++---------- .../test_modeling_conditional_detr.py | 2 +- 5 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 2ae4542d58bf6..9c1136744e665 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -374,7 +374,6 @@ [ # Do not add new models here, this class will be deprecated in the future. # Model for Image Segmentation mapping - ("conditional_detr", "ConditionalDetrForSegmentation"), ("detr", "DetrForSegmentation"), ] ) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index dedaa5ccda3df..f625a034d294b 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -152,6 +152,8 @@ class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtrac model_input_names = ["pixel_values", "pixel_mask"] + + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__init__ def __init__( self, format="coco_detection", @@ -172,11 +174,13 @@ def __init__( self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._is_valid_format def _is_valid_format(self, format): if format not in ["coco_detection", "coco_panoptic"]: raise ValueError(f"Format {format} not supported") return format + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): if self.format == "coco_detection": image, target = self.prepare_coco_detection(image, target, return_segmentation_masks) @@ -310,6 +314,7 @@ def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize def _resize(self, image, size, target=None, max_size=None): """ Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller @@ -380,6 +385,7 @@ def get_size(image_size, size, max_size=None): return rescaled_image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize def _normalize(self, image, mean, std, target=None): """ Normalize the image with a certain mean and std. 
@@ -402,6 +408,7 @@ def _normalize(self, image, mean, std, target=None): return image, target + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__call__ with Detr->ConditionalDetr,DETR->ConditionalDETR def __call__( self, images: ImageInput, @@ -549,7 +556,7 @@ def __call__( if annotations is not None: annotations = [annotations] - # prepare (COCO annotations as a list of Dict -> CONDITIONAL DETR target as a single Dict per image) + # prepare (COCO annotations as a list of Dict -> ConditionalDETR target as a single Dict per image) if annotations is not None: for idx, (image, target) in enumerate(zip(images, annotations)): if not isinstance(image, Image.Image): @@ -624,6 +631,7 @@ def __call__( return encoded_inputs + # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis def _max_by_axis(self, the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 20aa578737d70..188e1ab8f5dde 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -911,8 +911,7 @@ def forward( k_pos = self.sa_kpos_proj(query_position_embeddings) v = self.sa_v_proj(hidden_states) - bs, num_queries, n_model = q_content.shape - _, hw, _ = k_content.shape + _, num_queries, n_model = q_content.shape q = q_content + q_pos k = k_content + k_pos @@ -936,8 +935,8 @@ def forward( k_content = self.ca_kcontent_proj(encoder_hidden_states) v = self.ca_v_proj(encoder_hidden_states) - bs, num_queries, n_model = q_content.shape - _, hw, _ = k_content.shape + batch_size, num_queries, n_model = q_content.shape + _, src_len, _ = k_content.shape k_pos = self.ca_kpos_proj(position_embeddings) @@ -951,13 +950,13 @@ def forward( q = q_content k = k_content - q = q.view(bs, num_queries, self.nhead, n_model // self.nhead) + q = q.view(batch_size, num_queries, self.nhead, n_model // self.nhead) query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) - query_sine_embed = query_sine_embed.view(bs, num_queries, self.nhead, n_model // self.nhead) - q = torch.cat([q, query_sine_embed], dim=3).view(bs, num_queries, n_model * 2) - k = k.view(bs, hw, self.nhead, n_model // self.nhead) - k_pos = k_pos.view(bs, hw, self.nhead, n_model // self.nhead) - k = torch.cat([k, k_pos], dim=3).view(bs, hw, n_model * 2) + query_sine_embed = query_sine_embed.view(batch_size, num_queries, self.nhead, n_model // self.nhead) + q = torch.cat([q, query_sine_embed], dim=3).view(batch_size, num_queries, n_model * 2) + k = k.view(batch_size, src_len, self.nhead, n_model // self.nhead) + k_pos = k_pos.view(batch_size, src_len, self.nhead, n_model // self.nhead) + k = torch.cat([k, k_pos], dim=3).view(batch_size, src_len, n_model * 2) # Cross-Attention Block cross_attn_weights = None @@ -1118,6 +1117,7 @@ def _set_gradient_checkpointing(self, module, value=False): """ +# Copied from transformers.models.detr.modeling_detr.DetrEncoder with Detr->ConditionalDetr,DETR->ConditionalDETR class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -1125,7 +1125,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): The encoder updates the flattened feature map through multiple self-attention layers. 
- Small tweak for Conditional DETR: + Small tweak for ConditionalDETR: - position_embeddings are added to the forward pass. @@ -1141,7 +1141,7 @@ def __init__(self, config: ConditionalDetrConfig): self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) - # in the original Conditional DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + # in the original ConditionalDETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default # Initialize weights and apply final processing self.post_init() @@ -1191,7 +1191,7 @@ def forward( # expand attention_mask if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) encoder_states = () if output_hidden_states else None diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index 371d97de0000b..1e99c513f3524 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -240,8 +240,8 @@ def test_equivalence_pad_and_create_pixel_mask(self): encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") - assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + self.assertTrue(torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)) + self.assertTrue(torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4)) @slow def test_call_pytorch_with_coco_detection_annotations(self): @@ -261,31 +261,31 @@ def test_call_pytorch_with_coco_detection_annotations(self): self.assertEqual(encoding["pixel_values"].shape, expected_shape) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) # verify area expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) - assert torch.allclose(encoding["labels"][0]["area"], expected_area) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) # verify boxes expected_boxes_shape = torch.Size([6, 4]) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) # verify image_id expected_image_id = torch.tensor([39769]) - assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) # verify is_crowd expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - assert torch.allclose(encoding["labels"][0]["iscrowd"], 
expected_is_crowd) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) # verify class_labels expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) - assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) # verify orig_size expected_orig_size = torch.tensor([480, 640]) - assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) # verify size expected_size = torch.tensor([800, 1066]) - assert torch.allclose(encoding["labels"][0]["size"], expected_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) @slow def test_call_pytorch_with_coco_panoptic_annotations(self): @@ -308,31 +308,31 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): self.assertEqual(encoding["pixel_values"].shape, expected_shape) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) # verify area expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) - assert torch.allclose(encoding["labels"][0]["area"], expected_area) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) # verify boxes expected_boxes_shape = torch.Size([6, 4]) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) # verify image_id expected_image_id = torch.tensor([39769]) - assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) # verify is_crowd expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) # verify class_labels expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) - assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) # verify masks expected_masks_sum = 822338 self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) # verify orig_size expected_orig_size = torch.tensor([480, 640]) - assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) # verify size expected_size = torch.tensor([800, 1066]) - assert torch.allclose(encoding["labels"][0]["size"], expected_size) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index a67f0bc30638e..2c7be4d8f2b56 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ 
b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -472,7 +472,7 @@ def test_inference_no_head(self): outputs = model(**encoding) expected_shape = torch.Size((1, 300, 256)) - assert outputs.last_hidden_state.shape == expected_shape + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] ).to(torch_device) From a7e68707f69a0c287de075f69ab2396a33177322 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 11:41:00 -0400 Subject: [PATCH 226/233] fixed some variable names & added more fix-copies --- .../feature_extraction_conditional_detr.py | 1 - .../test_feature_extraction_conditional_detr.py | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index f625a034d294b..74cee04a72c7f 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -152,7 +152,6 @@ class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtrac model_input_names = ["pixel_values", "pixel_mask"] - # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__init__ def __init__( self, diff --git a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py index 1e99c513f3524..ddf7e66ef72fe 100644 --- a/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py +++ b/tests/models/conditional_detr/test_feature_extraction_conditional_detr.py @@ -240,8 +240,12 @@ def test_equivalence_pad_and_create_pixel_mask(self): encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") - self.assertTrue(torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)) - self.assertTrue(torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4)) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + ) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + ) @slow def test_call_pytorch_with_coco_detection_annotations(self): From 5c2314e25e23693f10c81ae2a715d3df510ceb10 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 20 Sep 2022 10:36:47 -0400 Subject: [PATCH 227/233] Update src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../conditional_detr/feature_extraction_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py index 74cee04a72c7f..96b9fa69db04e 100644 --- a/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py +++ b/src/transformers/models/conditional_detr/feature_extraction_conditional_detr.py @@ -122,7 +122,7 @@ def id_to_rgb(id_map): class 
ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" - Constructs a CONDITIONAL DETR feature extractor. + Constructs a Conditional DETR feature extractor. This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. From d462afa4be57d617b3904712973f16f0d6470951 Mon Sep 17 00:00:00 2001 From: DepuMeng Date: Tue, 20 Sep 2022 14:14:02 -0400 Subject: [PATCH 228/233] Update src/transformers/models/conditional_detr/configuration_conditional_detr.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/conditional_detr/configuration_conditional_detr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index df6673e9ebfef..2890fd2be39d7 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -27,8 +27,8 @@ logger = logging.get_logger(__name__) CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "microsoft/conditional_detr_resnet50": ( - "https://huggingface.co/microsoft/conditional_detr_resnet50/resolve/main/config.json" + "microsoft/conditional-detr-resnet-50": ( + "https://huggingface.co/microsoft/conditional_detr_resnet-50/resolve/main/config.json" ), } From 06464825d5b04d872490ca69ac930db2c383066e Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 14:18:53 -0400 Subject: [PATCH 229/233] added more copied from --- .../conditional_detr/modeling_conditional_detr.py | 11 +++++++++-- utils/check_repo.py | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 188e1ab8f5dde..92044c4d30e02 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1013,8 +1013,15 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with DetrMLPPredictionHead->MLP class MLP(nn.Module): - """Very simple multi-layer perceptron (also called FFN)""" + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. 
+ + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() @@ -1024,7 +1031,7 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): def forward(self, x): for i, layer in enumerate(self.layers): - x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x diff --git a/utils/check_repo.py b/utils/check_repo.py index 8659f795602e2..988967e797d12 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -167,6 +167,7 @@ "FlaxCLIPVisionModel", "FlaxWav2Vec2ForCTC", "DetrForSegmentation", + "ConditionalDetrForSegmentation", "DPRReader", "FlaubertForQuestionAnswering", "FlavaImageCodebook", From e20381423b20ccb81a2beea5b09ba66d7b83b5a8 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 14:25:39 -0400 Subject: [PATCH 230/233] fixed quality --- .../models/conditional_detr/modeling_conditional_detr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 92044c4d30e02..c18a4aaee24bd 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -22,7 +22,6 @@ import torch from torch import Tensor, nn -from torch.nn import functional as F from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput From 61fb0dcf761d47af420ae62cddd612057ae7de72 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Tue, 20 Sep 2022 14:31:16 -0400 Subject: [PATCH 231/233] changed pretrained config --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 2890fd2be39d7..afa6426bc3738 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -28,7 +28,7 @@ CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = { "microsoft/conditional-detr-resnet-50": ( - "https://huggingface.co/microsoft/conditional_detr_resnet-50/resolve/main/config.json" + "https://huggingface.co/microsoft/conditional-detr-resnet-50/resolve/main/config.json" ), } From 5eecfb1cffae0f2783e58dbecd33a0a34ae615db Mon Sep 17 00:00:00 2001 From: deppmeng Date: Wed, 21 Sep 2022 10:25:57 -0400 Subject: [PATCH 232/233] added more copied-from and fixed the issue in feature_extraction_auto --- .../models/auto/feature_extraction_auto.py | 2 +- .../modeling_conditional_detr.py | 61 +++++++++++-------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 015fd132ef0dc..cb75f439c233d 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,7 @@ [ ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("conditional_detr", "ConditionalDetrFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), ("cvt", "ConvNextFeatureExtractor"), ("data2vec-audio", 
"Wav2Vec2FeatureExtractor"), @@ -46,7 +47,6 @@ ("deformable_detr", "DetrFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), - ("detr", "DetrFeatureExtractor"), ("donut", "DonutFeatureExtractor"), ("dpt", "DPTFeatureExtractor"), ("flava", "FlavaFeatureExtractor"), diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index c18a4aaee24bd..79199ce06e428 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -509,11 +509,12 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) +# Copied from transformers.models.detr.modeling_detr.DetrAttention class DetrAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the Conditional DETR paper). + Here, we add position embeddings to the queries and keys (as explained in the DETR paper). """ def __init__( @@ -541,8 +542,8 @@ def __init__( self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): return tensor if position_embeddings is None else tensor + position_embeddings @@ -561,7 +562,7 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + batch_size, target_len, embed_dim = hidden_states.size() # add position embeddings to the hidden states before projecting to queries and keys if position_embeddings is not None: @@ -578,35 +579,36 @@ def forward( # get key, value proj if is_cross_attention: # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states_original), -1, bsz) + key_states = self._shape(self.k_proj(key_value_states), -1, batch_size) + value_states = self._shape(self.v_proj(key_value_states_original), -1, batch_size) else: # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states_original), -1, bsz) + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) key_states = key_states.view(*proj_shape) value_states = value_states.view(*proj_shape) - src_len = key_states.size(1) + source_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): raise 
ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" f" {attn_weights.size()}" ) if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): + if attention_mask.size() != (batch_size, 1, target_len, source_len): raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) attn_weights = nn.functional.softmax(attn_weights, dim=-1) @@ -615,8 +617,8 @@ def forward( # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) else: attn_weights_reshaped = None @@ -624,15 +626,15 @@ def forward( attn_output = torch.bmm(attn_probs, value_states) - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" f" {attn_output.size()}" ) - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) attn_output = self.out_proj(attn_output) @@ -755,6 +757,7 @@ def forward( return attn_output, attn_weights_reshaped +# Copied from transformers.models.detr.modeling_detr.DetrEncoderLayer with DetrEncoderLayer->ConditionalDetrEncoderLayer,DetrConfig->ConditionalDetrConfig class ConditionalDetrEncoderLayer(nn.Module): def __init__(self, config: ConditionalDetrConfig): super().__init__() @@ -783,7 +786,8 @@ def forward( Args: hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. position_embeddings (`torch.FloatTensor`, *optional*): position embeddings, to be added to hidden_states. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under @@ -1520,9 +1524,18 @@ def forward( >>> feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50") >>> model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50") + + >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") + + >>> # forward pass >>> outputs = model(**inputs) + + >>> # the last hidden states are the final query embeddings of the Transformer decoder + >>> # these are of shape (batch_size, num_queries, hidden_size) >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From 31492a575c012c74ccc8a10730193c4de0fe9a08 Mon Sep 17 00:00:00 2001 From: deppmeng Date: Wed, 21 Sep 2022 12:15:03 -0400 Subject: [PATCH 233/233] rebased --- .../utils/dummy_timm_and_vision_objects.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/transformers/utils/dummy_timm_and_vision_objects.py b/src/transformers/utils/dummy_timm_and_vision_objects.py index e990c33d2d317..0696a17d3778a 100644 --- a/src/transformers/utils/dummy_timm_and_vision_objects.py +++ b/src/transformers/utils/dummy_timm_and_vision_objects.py @@ -3,6 +3,37 @@ from ..utils import DummyObject, requires_backends +CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class ConditionalDetrForObjectDetection(metaclass=DummyObject): + _backends = ["timm", "vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) + + +class ConditionalDetrForSegmentation(metaclass=DummyObject): + _backends = ["timm", "vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) + + +class ConditionalDetrModel(metaclass=DummyObject): + _backends = ["timm", "vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) + + +class ConditionalDetrPreTrainedModel(metaclass=DummyObject): + _backends = ["timm", "vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) + + DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = None
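The patches above wire Conditional DETR into the auto classes and the dummy objects; the short sketch below shows how those pieces are meant to fit together at inference time. It is not part of the patch series: the `microsoft/conditional-detr-resnet-50` checkpoint name is taken from the configuration archive map fixed in PATCH 231, `AutoFeatureExtractor` resolution relies on the `conditional_detr` mapping entry added in PATCH 232, and a DETR-style `post_process` method is assumed on `ConditionalDetrFeatureExtractor` (whose implementation is copied from the DETR feature extractor); adjust names if the final merged API differs.

import requests
import torch
from PIL import Image

from transformers import AutoFeatureExtractor, ConditionalDetrForObjectDetection

# COCO val2017 image 39769, the image id the feature-extraction tests above check for
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50")
model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50")

# resize + normalize the image and build the pixel mask
inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# rescale the normalized (center_x, center_y, width, height) predictions back to absolute
# (x0, y0, x1, y1) pixel coordinates; post_process expects (height, width) target sizes
target_sizes = torch.tensor([image.size[::-1]])
results = feature_extractor.post_process(outputs, target_sizes)[0]

# keep only confident detections
keep = results["scores"] > 0.7
for score, label, box in zip(results["scores"][keep], results["labels"][keep], results["boxes"][keep]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(c, 1) for c in box.tolist()])

The 0.7 threshold is only illustrative; the detection head scores all 300 queries, so some confidence cutoff or top-k selection is needed before visualizing boxes.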