From 62043901693def0e89d5e7d4da3bee0521dcb0a5 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 10 Aug 2022 14:13:47 +0000 Subject: [PATCH 01/40] First draft --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/x-clip.mdx | 60 + src/transformers/__init__.py | 23 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 1 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 7 + src/transformers/models/x_clip/__init__.py | 65 + .../models/x_clip/configuration_x_clip.py | 325 ++++ .../convert_x_clip_original_pytorch_to_hf.py | 136 ++ .../models/x_clip/modeling_x_clip.py | 1312 +++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 31 + tests/models/x_clip/__init__.py | 0 tests/models/x_clip/test_modeling_x_clip.py | 674 +++++++++ utils/check_config_docstrings.py | 1 + utils/check_repo.py | 2 + 22 files changed, 2649 insertions(+) create mode 100644 docs/source/en/model_doc/x-clip.mdx create mode 100644 src/transformers/models/x_clip/__init__.py create mode 100644 src/transformers/models/x_clip/configuration_x_clip.py create mode 100644 src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py create mode 100644 src/transformers/models/x_clip/modeling_x_clip.py create mode 100644 tests/models/x_clip/__init__.py create mode 100644 tests/models/x_clip/test_modeling_x_clip.py diff --git a/README.md b/README.md index 5f89bacf6415d..097d934f016b4 100644 --- a/README.md +++ b/README.md @@ -382,6 +382,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[X-XClip](https://huggingface.co/docs/transformers/main/model_doc/x-clip)** (from ) released with the paper []() by . 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. 
**[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. diff --git a/README_ko.md b/README_ko.md index cc0b790ad76a8..8d2f9a3c51e1c 100644 --- a/README_ko.md +++ b/README_ko.md @@ -334,6 +334,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[X-XClip](https://huggingface.co/docs/transformers/main/model_doc/x-clip)** (from ) released with the paper []() by . 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. diff --git a/README_zh-hans.md b/README_zh-hans.md index fe2fa45f71f39..f6b4d1b71e798 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -358,6 +358,7 @@ conda install -c huggingface transformers 1. 
**[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[X-XClip](https://huggingface.co/docs/transformers/main/model_doc/x-clip)** (from ) released with the paper []() by . 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 4f5a995476149..b12a945661d53 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -370,6 +370,7 @@ conda install -c huggingface transformers 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. 
**[X-XClip](https://huggingface.co/docs/transformers/main/model_doc/x-clip)** (from ) released with the paper []() by . 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index ed04cad3dd9bf..f3da3efa34c9c 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -176,6 +176,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[X-XClip](model_doc/x-clip)** (from ) released with the paper []() by . 1. **[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. @@ -312,6 +313,7 @@ Flax), PyTorch, and/or TensorFlow. 
| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | | Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | | WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| X-XClip | ❌ | ❌ | ✅ | ❌ | ❌ | | XGLM | ✅ | ✅ | ✅ | ✅ | ✅ | | XLM | ✅ | ❌ | ✅ | ✅ | ❌ | | XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/x-clip.mdx b/docs/source/en/model_doc/x-clip.mdx new file mode 100644 index 0000000000000..2f04e33867c72 --- /dev/null +++ b/docs/source/en/model_doc/x-clip.mdx @@ -0,0 +1,60 @@ + + +# X-CLIP + +## Overview + +The X-CLIP model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). + + +## XClipConfig + +[[autodoc]] XClipConfig + - from_text_vision_configs + +## XClipTextConfig + +[[autodoc]] XClipTextConfig + +## XClipVisionConfig + +[[autodoc]] XClipVisionConfig + +## XClipModel + +[[autodoc]] XClipModel + - forward + - get_text_features + - get_image_features + +## XClipTextModel + +[[autodoc]] XClipTextModel + - forward + +## XClipVisionModel + +[[autodoc]] XClipVisionModel + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aff905b97ec5d..644ca40a7256a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -367,6 +367,12 @@ "WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "WavLMConfig", ], + "models.x_clip": [ + "X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", + "XClipConfig", + "XClipTextConfig", + "XClipVisionConfig", + ], "models.xglm": ["XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "XGLMConfig"], "models.xlm": ["XLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMConfig", "XLMTokenizer"], "models.xlm_prophetnet": ["XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMProphetNetConfig"], @@ -983,6 +989,15 @@ "CLIPVisionModel", ] ) + _import_structure["models.x_clip"].extend( + [ + "X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST", + "XClipModel", + "XClipPreTrainedModel", + "XClipTextModel", + "XClipVisionModel", + ] + ) _import_structure["models.convbert"].extend( [ "CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3164,6 +3179,7 @@ from .models.wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizer from .models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM from .models.wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig + from .models.x_clip import X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, XClipConfig, XClipTextConfig, XClipVisionConfig from .models.xglm import XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XGLMConfig from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer from .models.xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig @@ -4464,6 +4480,13 @@ WavLMModel, WavLMPreTrainedModel, ) + from .models.x_clip import ( + X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + XClipModel, + XClipPreTrainedModel, + XClipTextModel, + XClipVisionModel, + ) from .models.xglm import XGLM_PRETRAINED_MODEL_ARCHIVE_LIST, XGLMForCausalLM, XGLMModel, XGLMPreTrainedModel from .models.xlm import ( XLM_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index fdf315b2257d8..9db07572a65d6 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -150,6 +150,7 @@ wav2vec2_phoneme, wav2vec2_with_lm, wavlm, + x_clip, xglm, xlm, xlm_prophetnet, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c9e6156a3843d..aaa9d3dcf8068 100644 --- 
a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -143,6 +143,7 @@ ("wav2vec2", "Wav2Vec2Config"), ("wav2vec2-conformer", "Wav2Vec2ConformerConfig"), ("wavlm", "WavLMConfig"), + ("x-clip", "XClipConfig"), ("xglm", "XGLMConfig"), ("xlm", "XLMConfig"), ("xlm-prophetnet", "XLMProphetNetConfig"), @@ -257,6 +258,7 @@ ("vit_mae", "VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2-conformer", "WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("x-clip", "X_XClip_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xglm", "XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xlm-prophetnet", "XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -405,6 +407,7 @@ ("wav2vec2-conformer", "Wav2Vec2-Conformer"), ("wav2vec2_phoneme", "Wav2Vec2Phoneme"), ("wavlm", "WavLM"), + ("x-clip", "X-XClip"), ("xglm", "XGLM"), ("xlm", "XLM"), ("xlm-prophetnet", "XLM-ProphetNet"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 3058aaa4334a2..8b3a4f6afdaf0 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -75,6 +75,7 @@ ("vit_mae", "ViTFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"), + ("x-clip", "XClipFeatureExtractor"), ("yolos", "YolosFeatureExtractor"), ] ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 0e026cb48d0c0..c7b431923fb28 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -137,6 +137,7 @@ ("wav2vec2", "Wav2Vec2Model"), ("wav2vec2-conformer", "Wav2Vec2ConformerModel"), ("wavlm", "WavLMModel"), + ("x-clip", "XClipModel"), ("xglm", "XGLMModel"), ("xlm", "XLMModel"), ("xlm-prophetnet", "XLMProphetNetModel"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index c6f4fd98316a4..f5f1f40818848 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -58,6 +58,7 @@ ("wav2vec2-conformer", "Wav2Vec2Processor"), ("wav2vec2_with_lm", "Wav2Vec2ProcessorWithLM"), ("wavlm", "Wav2Vec2Processor"), + ("x-clip", "XClipProcessor"), ] ) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 8ece13b79fe3f..3f68227cd852d 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -253,6 +253,13 @@ ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), + ( + "x-clip", + ( + "CLIPTokenizer", + "CLIPTokenizerFast" if is_tokenizers_available() else None, + ), + ), ( "xglm", ( diff --git a/src/transformers/models/x_clip/__init__.py b/src/transformers/models/x_clip/__init__.py new file mode 100644 index 0000000000000..968d3daac373d --- /dev/null +++ b/src/transformers/models/x_clip/__init__.py @@ -0,0 +1,65 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_x_clip": ["X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "XClipConfig", "XClipTextConfig", "XClipVisionConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_x_clip"] = [ + "X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST", + "XClipModel", + "XClipPreTrainedModel", + "XClipTextModel", + "XClipVisionModel", + ] + +if TYPE_CHECKING: + from .configuration_x_clip import X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, XClipConfig, XClipTextConfig, XClipVisionConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_x_clip import ( + X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + XClipModel, + XClipPreTrainedModel, + XClipTextModel, + XClipVisionModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py new file mode 100644 index 0000000000000..fbd6b0ddf5f9c --- /dev/null +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -0,0 +1,325 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" XClip model configuration""" + +import copy +import os +from typing import Union + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "microsoft/xclip-base-patch32": "https://huggingface.co/microsoft/xclip-base-patch32/resolve/main/config.json", +} + + + +class XClipTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`XClipModel`]. It is used to instantiate an XClip + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the XClip + [microsoft/xclip-base-patch32](https://huggingface.co/microsoft/xclip-base-patch32) architecture. 
+ + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the XClip text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`XClipModel`]. + hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). 
+ + Example: + + ```python + >>> from transformers import XClipTextModel, XClipTextConfig + + >>> # Initializing a XClipTextModel with microsoft/xclip-base-patch32 style configuration + >>> configuration = XClipTextConfig() + + >>> # Initializing a XClipTextConfig from the microsoft/xclip-base-patch32 style configuration + >>> model = XClipTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "x_clip_text_model" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from XClipConfig + if config_dict.get("model_type") == "x_clip": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class XClipVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`XClipModel`]. It is used to instantiate an XClip + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the XClip + [microsoft/xclip-base-patch32](https://huggingface.co/microsoft/xclip-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. 
+ patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate. + + Example: + + ```python + >>> from transformers import XClipVisionModel, XClipVisionConfig + + >>> # Initializing a XClipVisionModel with microsoft/xclip-base-patch32 style configuration + >>> configuration = XClipVisionConfig() + + >>> # Initializing a XClipVisionModel model from the microsoft/xclip-base-patch32 style configuration + >>> model = XClipVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "x_clip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + num_frames=8, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + drop_path_rate=0.0, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.num_frames = num_frames + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.drop_path_rate = drop_path_rate + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from XClipConfig + if config_dict.get("model_type") == "x_clip": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class XClipConfig(PretrainedConfig): + r""" + [`XClipConfig`] is the configuration class to store the configuration of a [`XClipModel`]. 
It is used to + instantiate XClip model according to the specified arguments, defining the text model and vision model configs. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config_dict (`dict`, *optional*): + Dictionary of configuration options used to initialize [`XClipTextConfig`]. + vision_config_dict (`dict`, *optional*): + Dictionary of configuration options used to initialize [`XClipVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimentionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The inital value of the *logit_scale* paramter. Default is used as per the original XClip implementation. + kwargs (*optional*): + Dictionary of keyword arguments. + """ + + model_type = "x-clip" + is_composition = True + + def __init__( + self, + text_config_dict=None, + vision_config_dict=None, + projection_dim=512, + logit_scale_init_value=2.6592, + **kwargs + ): + super().__init__(text_config_dict=text_config_dict, vision_config_dict=vision_config_dict, **kwargs) + + if text_config_dict is None: + text_config_dict = {} + logger.info("text_config_dict is None. Initializing the XClipTextConfig with default values.") + + if vision_config_dict is None: + vision_config_dict = {} + logger.info("vision_config_dict is None. initializing the XClipVisionConfig with default values.") + + self.text_config = XClipTextConfig(**text_config_dict) + self.vision_config = XClipVisionConfig(**vision_config_dict) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: XClipTextConfig, vision_config: XClipVisionConfig, **kwargs): + r""" + Instantiate a [`XClipConfig`] (or a derived class) from x_clip text model configuration and x_clip vision model + configuration. + + Returns: + [`XClipConfig`]: An instance of a configuration object + """ + + return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py new file mode 100644 index 0000000000000..aa89057ef7d80 --- /dev/null +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json + +import numpy as np +import torch +from PIL import Image + +import requests +from flax.training import checkpoints +from flax.traverse_util import flatten_dict +from huggingface_hub import hf_hub_download +from transformers import AutoFeatureExtractor, XClipConfig, XClipModel + + +def get_xclip_config(model_name): + config = XClipConfig() + return config + + +def rename_key(name): + # text encoder + if name == "token_embedding.weight": + name = name.replace("token_embedding.weight", "text_model.embeddings.token_embedding.weight") + if name == "positional_embedding": + name = name.replace("positional_embedding", "text_model.embeddings.position_embedding.weight") + if "ln_1" in name: + name = name.replace("ln_1", "layer_norm1") + if "ln_2" in name: + name = name.replace("ln_2", "layer_norm2") + if "c_fc" in name: + name = name.replace("c_fc", "fc1") + if "c_proj" in name: + name = name.replace("c_proj", "fc2") + if name.startswith("transformer.resblocks"): + name = name.replace("transformer.resblocks", "text_model.encoder.layers") + if "attn.out_proj" in name: + name = name.replace("attn.out_proj", "self_attn.out_proj") + # visual encoder + + return name + + +def convert_state_dict(orig_state_dict, config): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + dim = config.text_config.hidden_size + + print("Old key:", key) + + if "attn.in_proj" in key and "visual" not in key: + key_split = key.split(".") + layer_num = key_split[2] + if "weight" in key: + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ + dim : dim * 2, : + ] + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] + else: + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] + else: + new_key_name = rename_key(key) + print("New key:", new_key_name) + orig_state_dict[rename_key(key)] = val + + return orig_state_dict + + +def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_path): + config = get_xclip_config(model_name) + model = XClipModel(config) + model.eval() + + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)['model'] + state_dict = convert_state_dict(state_dict, config) + + model = XClipModel(config) + model.load_state_dict(state_dict) + + # url = "http://images.cocodataset.org/val2017/000000039769.jpg" + + # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) + # image = Image.open(requests.get(url, stream=True).raw) + # inputs = feature_extractor(images=image, return_tensors="pt") + + # timm_outs = timm_model(inputs["pixel_values"]) + # hf_outs = model(**inputs).logits + + # assert torch.allclose(timm_outs, hf_outs, atol=1e-3) + + # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + # model.save_pretrained(pytorch_dump_folder_path) + + # print(f"Saving feature extractor to {pytorch_dump_folder_path}") + # feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() 
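As an aside, the `convert_state_dict` function above splits the original checkpoint's fused `attn.in_proj` weight and bias into separate q/k/v projections. A minimal, self-contained sketch of that slicing follows (the hidden size is made up for illustration; this is not part of the conversion script itself):

```python
import torch

# Hypothetical illustration of the q/k/v split performed in convert_state_dict:
# the original checkpoint stores a fused projection of shape (3 * dim, dim),
# which is sliced into three (dim, dim) matrices for q_proj, k_proj and v_proj.
dim = 512  # assumed text hidden size for this sketch
in_proj_weight = torch.randn(3 * dim, dim)
in_proj_bias = torch.randn(3 * dim)

q_weight = in_proj_weight[:dim, :]
k_weight = in_proj_weight[dim : dim * 2, :]
v_weight = in_proj_weight[-dim:, :]

q_bias, k_bias, v_bias = in_proj_bias[:dim], in_proj_bias[dim : dim * 2], in_proj_bias[-dim:]

assert q_weight.shape == k_weight.shape == v_weight.shape == (dim, dim)
assert q_bias.shape == k_bias.shape == v_bias.shape == (dim,)
```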
+ # Required parameters + parser.add_argument( + "--checkpoint_url", + default="https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_32_8.pth", + type=str, + help="URL fo the original PyTorch checkpoint (.pth file).", + ) + parser.add_argument( + "--model_name", + default="xclip-base-patch32", + type=str, + help="Name of the model.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + + args = parser.parse_args() + convert_xclip_checkpoint(args.checkpoint_url, args.model_name, args.pytorch_dump_folder_path) \ No newline at end of file diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py new file mode 100644 index 0000000000000..9ead750c34208 --- /dev/null +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -0,0 +1,1312 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch X-CLIP model.""" + + +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_x_clip import XClipConfig, XClipTextConfig, XClipVisionConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "microsoft/xclip-base-patch32" + +X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/xclip-base-patch32", + # See all X-CLIP models at https://huggingface.co/models?filter=x-clip +] + + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/XClip.html +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->x_clip +def x_clip_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.T) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->XClip +class XClipOutput(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`XClipTextModel`]. + image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of + [`XClipVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`XClipTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`XClipVisionModel`]. 
+ """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: torch.FloatTensor = None + logits_per_text: torch.FloatTensor = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->XClip +class XClipVisionEmbeddings(nn.Module): + def __init__(self, config: XClipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=False + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->XClip +class XClipTextEmbeddings(nn.Module): + def __init__(self, config: XClipTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->XClip +class XClipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and 
`num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->XClip +class XClipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->XClip +class XClipEncoderLayer(nn.Module): + def __init__(self, config: XClipConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = XClipAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = XClipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input, drop_prob: float = 0.0, training: bool = False): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->XClip +class XClipDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class XClipVisionEncoderLayer(nn.Module): + """ + This corresponds to the `CrossFramelAttentionBlock` class in the original implementation. + """ + def __init__(self, config: XClipConfig): + super().__init__() + self.T = config.num_frames + + self.embed_dim = config.hidden_size + + self.message_fc = nn.Linear(self.embed_dim, self.embed_dim) + self.message_ln = nn.LayerNorm(self.embed_dim) + self.message_attn = XClipAttention(config) + + self.drop_path = XClipDropPath(config.drop_path_rate) if config.drop_path_rate > 0. else nn.Identity() + + self.self_attn = XClipAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = XClipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+        """
+        l, bt, d = hidden_states.size()
+        b = bt // self.T
+        hidden_states = hidden_states.view(l, b, self.T, d)
+
+        # build one message token per frame from the frame's [CLS] token
+        msg_token = self.message_fc(hidden_states[0, :, :, :])
+        msg_token = msg_token.view(b, self.T, 1, d)
+
+        # let the message tokens of all frames attend to each other
+        msg_token = msg_token.permute(1, 2, 0, 3).view(self.T, b, d)
+        msg_token = msg_token + self.drop_path(self.message_attn(hidden_states=self.message_ln(msg_token))[0])
+        msg_token = msg_token.view(self.T, 1, b, d).permute(1, 2, 0, 3)
+
+        hidden_states = torch.cat([hidden_states, msg_token], dim=0)
+
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->XClip,clip->x_clip
+class XClipPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = XClipConfig
+    base_model_prefix = "x_clip"
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        factor = self.config.initializer_factor
+        if isinstance(module, XClipTextEmbeddings):
+            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+        elif isinstance(module, XClipVisionEmbeddings):
+            factor = self.config.initializer_factor
+            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+        elif isinstance(module, XClipAttention):
+            factor = self.config.initializer_factor
+            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            out_proj_std = (module.embed_dim**-0.5) * factor
+            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+        elif isinstance(module, XClipMLP):
+            factor = self.config.initializer_factor
+            in_proj_std = (
+                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            )
+            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+            nn.init.normal_(module.fc1.weight, std=fc_std)
+            nn.init.normal_(module.fc2.weight, std=in_proj_std)
+        elif isinstance(module, XClipModel):
+            nn.init.normal_(
+                module.text_projection.weight,
+                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+            )
+            nn.init.normal_(
+                module.visual_projection.weight,
+                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+            )
+
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+ 
module.bias.data.zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, XClipEncoder): + module.gradient_checkpointing = value + + +X_CLIP_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`XClipConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +X_CLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +X_CLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +X_CLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->XClip +class XClipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`XClipEncoderLayer`]. + + Args: + config: XClipConfig + """ + + def __init__(self, config: XClipConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([XClipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + causal_attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class XClipTextTransformer(nn.Module): + def __init__(self, config: XClipTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = XClipTextEmbeddings(config) + self.encoder = XClipEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XClipTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + bsz, seq_len = input_shape + # X_CLIP's text model uses causal mask, prepare it here. 
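+        # the causal mask is additive: positions above the diagonal are filled with a large negative value
+        # (`torch.finfo(dtype).min`), e.g. for seq_len = 3:
+        #   [[0, -inf, -inf],
+        #    [0,    0, -inf],
+        #    [0,    0,    0]]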
+        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+        causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
+            hidden_states.device
+        )
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+        # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)]
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
+        # lazily create causal attention mask, with full attention between the text tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
+        mask.fill_(torch.tensor(torch.finfo(dtype).min))
+        mask.triu_(1)  # zero out the lower diagonal
+        mask = mask.unsqueeze(1)  # expand mask
+        return mask
+
+
+class XClipTextModel(XClipPreTrainedModel):
+    config_class = XClipTextConfig
+
+    def __init__(self, config: XClipTextConfig):
+        super().__init__(config)
+        self.text_model = XClipTextTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.embeddings.token_embedding
+
+    def set_input_embeddings(self, value):
+        self.text_model.embeddings.token_embedding = value
+
+    @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XClipTextConfig)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import CLIPTokenizer, XClipTextModel
+
+        >>> model = XClipTextModel.from_pretrained("microsoft/xclip-base-patch32")
+        >>> tokenizer = CLIPTokenizer.from_pretrained("microsoft/xclip-base-patch32")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
+        ```"""
+        return self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+class XClipVisionEncoder(nn.Module): 
+ """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`XClipVisionEncoderLayer`]. + + Args: + config: XClipConfig + """ + + def __init__(self, config: XClipConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([XClipVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + causal_attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class XClipVisionTransformer(nn.Module): + """ + This corresponds to the `CrossFrameCommunicationTransformer` class in the original implementation. + """ + + def __init__(self, config: XClipVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = XClipVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim) + self.encoder = XClipVisionEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim) + + @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XClipVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + 
attentions=encoder_outputs.attentions, + ) + + +class XClipVisionModel(XClipPreTrainedModel): + config_class = XClipVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: XClipVisionConfig): + super().__init__(config) + self.vision_model = XClipVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XClipVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, XClipVisionModel + + >>> model = XClipVisionModel.from_pretrained("microsoft/xclip-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +@add_start_docstrings(X_CLIP_START_DOCSTRING) +class XClipModel(XClipPreTrainedModel): + config_class = XClipConfig + + def __init__(self, config: XClipConfig): + super().__init__(config) + + if not isinstance(config.text_config, XClipTextConfig): + raise ValueError( + "config.text_config is expected to be of type XClipTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, XClipVisionConfig): + raise ValueError( + "config.vision_config is expected to be of type XClipVisionConfig but is of type" + f" {type(config.vision_config)}." 
+ ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = XClipTextTransformer(text_config) + self.vision_model = XClipVisionTransformer(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING) + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`XClipTextModel`]. + + Examples: + + ```python + >>> from transformers import CLIPTokenizer, XClipModel + + >>> model = XClipModel.from_pretrained("microsoft/xclip-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("microsoft/xclip-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + # Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) + + return text_features + + @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING) + def get_video_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`XClipVisionModel`]. 
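+            `pixel_values` are expected to be of shape `(batch_size, num_frames, num_channels, height, width)`; the
+            frames are flattened into the batch dimension before being forwarded through the vision encoder.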
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import CLIPProcessor, XClipModel
+
+        >>> model = XClipModel.from_pretrained("microsoft/xclip-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> image_features = model.get_video_features(**inputs)
+        ```"""
+        # Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # flatten the frames into the batch dimension before forwarding them through the vision encoder
+        batch_size, num_frames, num_channels, height, width = pixel_values.shape
+        pixel_values = pixel_values.reshape(-1, num_channels, height, width)
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = vision_outputs[1]  # pooled_output
+        image_features = self.visual_projection(pooled_output)
+
+        # TODO add the following:
+        # img_features = self.prompts_visual_ln(img_features)
+        # img_features = img_features @ self.prompts_visual_proj
+
+        # cls_features = cls_features.view(b, t, -1)
+        # img_features = img_features.view(b,t,-1,cls_features.shape[-1])
+
+        # video_features = self.mit(cls_features)
+
+        return image_features
+
+    @add_start_docstrings_to_model_forward(X_CLIP_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=XClipOutput, config_class=XClipConfig)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        return_loss: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, XClipOutput]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import CLIPProcessor, XClipModel
+
+        >>> model = XClipModel.from_pretrained("microsoft/xclip-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(
+        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+        ... )
+
+        >>> outputs = model(**inputs)
+        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
+        ```"""
+        # Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.T + + loss = None + if return_loss: + loss = x_clip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return XClipOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 9c0db79e44b8d..ff16d25ec585e 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5168,6 +5168,37 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class XClipModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class XClipPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class XClipTextModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class XClipVisionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + XGLM_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/x_clip/__init__.py b/tests/models/x_clip/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py new file mode 100644 index 0000000000000..1c7dd31f3fc4a --- /dev/null +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -0,0 +1,674 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch XClip model. """ + + +import inspect +import os +import tempfile +import unittest + +import numpy as np + +import requests +import transformers +from transformers import XClipConfig, XClipTextConfig, XClipVisionConfig +from transformers.testing_utils import ( + is_flax_available, + is_pt_flax_cross_test, + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + random_attention_mask, +) + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import XClipModel, XClipTextModel, XClipVisionModel + from transformers.models.x_clip.modeling_x_clip import X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPProcessor + + +if is_flax_available(): + import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( + convert_pytorch_state_dict_to_flax, + load_flax_weights_in_pytorch_model, + ) + + +class XClipVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return XClipVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = XClipVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = 
(self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class XClipVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as X_CLIP does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (XClipVisionModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = XClipVisionModelTester(self) + self.config_tester = ConfigTester(self, config_class=XClipVisionConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="X_CLIP does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_training(self): + pass + + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="XClipVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="XClipVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = XClipVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class XClipTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = 
vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return XClipTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = XClipTextModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class XClipTextModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = (XClipTextModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = XClipTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=XClipTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_training(self): + pass + + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="X_CLIP does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="XClipTextModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="XClipTextModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = XClipTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class XClipModelTester: + def __init__(self, parent, is_training=True): + self.parent = parent + 
self.text_model_tester = XClipTextModelTester(parent) + self.vision_model_tester = XClipVisionModelTester(parent) + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return XClipConfig.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = XClipModel(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values, attention_mask) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_torch +class XClipModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (XClipModel,) if is_torch_available() else () + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + + def setUp(self): + self.model_tester = XClipModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Inputs_embeds is tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="XClipModel does not have input/output embeddings") + def test_model_common_attributes(self): + pass + + # override as the `logit_scale` parameter initilization is different for X_CLIP + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initilized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + 
configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # X_CLIP needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_load_vision_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save XClipConfig and check if we can load XClipVisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = XClipVisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save XClipConfig and check if we can load XClipTextConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + text_config = XClipTextConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) + + # overwrite from common since FlaxXClipModel returns nested output + # which is not supported in the common test + @is_pt_flax_cross_test + def test_equivalence_pt_to_flax(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + + # load PyTorch class + pt_model = model_class(config).eval() + # Flax models don't use the `use_cache` option and cache is not returned as a default. + # So we disable `use_cache` here for PyTorch model. 
+ pt_model.config.use_cache = False + + fx_model_class_name = "Flax" + model_class.__name__ + + if not hasattr(transformers, fx_model_class_name): + return + + fx_model_class = getattr(transformers, fx_model_class_name) + + # load Flax class + fx_model = fx_model_class(config, dtype=jnp.float32) + # make sure only flax inputs are forward that actually exist in function args + fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() + + # prepare inputs + pt_inputs = self._prepare_for_class(inputs_dict, model_class) + + # remove function args that don't exist in Flax + pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} + + fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) + fx_model.params = fx_state + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + # convert inputs to Flax + fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} + fx_outputs = fx_model(**fx_inputs).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + pt_model.save_pretrained(tmpdirname) + fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True) + + fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple() + self.assertEqual( + len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" + ) + for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) + + # overwrite from common since FlaxXClipModel returns nested output + # which is not supported in the common test + @is_pt_flax_cross_test + def test_equivalence_flax_to_pt(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + # load corresponding PyTorch class + pt_model = model_class(config).eval() + + # So we disable `use_cache` here for PyTorch model. 
+ pt_model.config.use_cache = False + + fx_model_class_name = "Flax" + model_class.__name__ + + if not hasattr(transformers, fx_model_class_name): + # no flax model exists for this class + return + + fx_model_class = getattr(transformers, fx_model_class_name) + + # load Flax class + fx_model = fx_model_class(config, dtype=jnp.float32) + # make sure only flax inputs are forward that actually exist in function args + fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() + + pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) + + # make sure weights are tied in PyTorch + pt_model.tie_weights() + + # prepare inputs + pt_inputs = self._prepare_for_class(inputs_dict, model_class) + + # remove function args that don't exist in Flax + pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} + + fx_outputs = fx_model(**fx_inputs).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + fx_model.save_pretrained(tmpdirname) + pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True) + + with torch.no_grad(): + pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() + + self.assertEqual( + len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" + ) + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + @slow + def test_model_from_pretrained(self): + for model_name in X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = XClipModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +@require_torch +class XClipModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "microsoft/xclip-base-patch32" + model = XClipModel.from_pretrained(model_name).to(torch_device) + processor = CLIPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" + ).to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) + + self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) diff --git a/utils/check_config_docstrings.py b/utils/check_config_docstrings.py index bcbbace39e0e7..f65a589b7889b 100644 --- a/utils/check_config_docstrings.py +++ b/utils/check_config_docstrings.py @@ -40,6 +40,7 @@ CONFIG_CLASSES_TO_IGNORE_FOR_DOCSTRING_CHECKPOINT_CHECK = { + "XClipConfig", "CLIPConfig", "OwlViTConfig", "GroupViTConfig", diff --git 
a/utils/check_repo.py b/utils/check_repo.py index c3ecfbebe48ce..7fd6cdb670f89 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -125,6 +125,8 @@ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping + "XClipVisionModel", + "XClipTextModel", "DPTForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", From 7e3f4bba5756a25519fbd3ec6aacea47feb79eed Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 10 Aug 2022 16:35:16 +0000 Subject: [PATCH 02/40] Improve conversion script --- src/transformers/models/x_clip/__init__.py | 20 ++-- .../models/x_clip/configuration_x_clip.py | 1 - .../convert_x_clip_original_pytorch_to_hf.py | 110 +++++++++++++----- .../models/x_clip/modeling_x_clip.py | 46 +++++--- src/transformers/models/x_clip/test.py | 7 ++ tests/models/x_clip/test_modeling_x_clip.py | 4 +- 6 files changed, 133 insertions(+), 55 deletions(-) create mode 100644 src/transformers/models/x_clip/test.py diff --git a/src/transformers/models/x_clip/__init__.py b/src/transformers/models/x_clip/__init__.py index 968d3daac373d..d4480f601f122 100644 --- a/src/transformers/models/x_clip/__init__.py +++ b/src/transformers/models/x_clip/__init__.py @@ -17,15 +17,16 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { - "configuration_x_clip": ["X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "XClipConfig", "XClipTextConfig", "XClipVisionConfig"], + "configuration_x_clip": [ + "X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", + "XClipConfig", + "XClipTextConfig", + "XClipVisionConfig", + ], } try: @@ -43,7 +44,12 @@ ] if TYPE_CHECKING: - from .configuration_x_clip import X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, XClipConfig, XClipTextConfig, XClipVisionConfig + from .configuration_x_clip import ( + X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, + XClipConfig, + XClipTextConfig, + XClipVisionConfig, + ) try: if not is_torch_available(): diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index fbd6b0ddf5f9c..7a0f014796805 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -29,7 +29,6 @@ } - class XClipTextConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`XClipModel`]. It is used to instantiate an XClip diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index aa89057ef7d80..6c76deb4c8447 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -14,15 +14,9 @@ # limitations under the License. 
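# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): the conversion script edited below
# maps the original X-CLIP checkpoint onto the HF `XClipModel`. The central step
# is that the original checkpoint stores each attention layer's query/key/value
# weights fused into a single `attn.in_proj_weight` of shape (3 * dim, dim),
# which `convert_state_dict` slices into separate q/k/v projections, roughly:
#
#     dim = config.text_config.hidden_size    # or vision_config.hidden_size for the visual tower
#     q_w, k_w, v_w = val[:dim, :], val[dim : 2 * dim, :], val[-dim:, :]
#
# Based on the argparse arguments defined at the bottom of this file, an assumed
# invocation could look like the following (the URL and paths are placeholders,
# not taken from the patch):
#
#     python convert_x_clip_original_pytorch_to_hf.py \
#         --checkpoint_url <url_of_original_checkpoint> \
#         --model_name xclip-base-patch32 \
#         --pytorch_dump_folder_path /tmp/xclip
# ---------------------------------------------------------------------------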
import argparse -import json -import numpy as np import torch -from PIL import Image -import requests -from flax.training import checkpoints -from flax.traverse_util import flatten_dict from huggingface_hub import hf_hub_download from transformers import AutoFeatureExtractor, XClipConfig, XClipModel @@ -48,9 +42,31 @@ def rename_key(name): name = name.replace("c_proj", "fc2") if name.startswith("transformer.resblocks"): name = name.replace("transformer.resblocks", "text_model.encoder.layers") - if "attn.out_proj" in name: + if "attn.out_proj" in name and "message" not in name: name = name.replace("attn.out_proj", "self_attn.out_proj") # visual encoder + if name == "visual.class_embedding": + name = name.replace("visual.class_embedding", "vision_model.embeddings.class_embedding") + if name == "visual.positional_embedding": + name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding.weight") + if name.startswith("visual.transformer.resblocks"): + name = name.replace("visual.transformer.resblocks", "vision_model.encoder.layers") + if "visual.conv1" in name: + name = name.replace("visual.conv1", "vision_model.embeddings.patch_embedding") + if "visual.ln_pre" in name: + name = name.replace("visual.ln_pre", "vision_model.pre_layernorm") + if "visual.ln_post" in name: + name = name.replace("visual.ln_post", "vision_model.post_layernorm") + # things on top + if "ln_final" in name: + name = name.replace("ln_final", "final_layernorm") + if "visual.proj" in name: + name = name.replace("visual.proj", "visual_projection") + if "prompts_visual_proj" in name: + name = name.replace("prompts_visual_proj", "prompts_visual_projection") + if "prompts_visual_ln" in name: + name = name.replace("prompts_visual_ln", "prompts_visual_layernorm") + # TODO: prompts generator, mit return name @@ -59,26 +75,66 @@ def convert_state_dict(orig_state_dict, config): for key in orig_state_dict.copy().keys(): val = orig_state_dict.pop(key) - dim = config.text_config.hidden_size - - print("Old key:", key) - - if "attn.in_proj" in key and "visual" not in key: + if "attn.in_proj" in key: key_split = key.split(".") - layer_num = key_split[2] - if "weight" in key: - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ - dim : dim * 2, : - ] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] + if key.startswith("visual"): + layer_num = key_split[3] + dim = config.vision_config.hidden_size + if "message_attn" in key: + if "weight" in key: + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.message_attn.q_proj.weight"] = val[ + :dim, : + ] + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.message_attn.k_proj.weight"] = val[ + dim : dim * 2, : + ] + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.message_attn.v_proj.weight"] = val[ + -dim:, : + ] + else: + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.message_attn.q_proj.bias"] = val[ + :dim + ] + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.message_attn.k_proj.bias"] = val[ + dim : dim * 2 + ] + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.message_attn.v_proj.bias"] = val[ + -dim: + ] + else: + if "weight" in key: + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[ + :dim, : + ] + 
orig_state_dict[f"vision_model.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ + dim : dim * 2, : + ] + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[ + -dim:, : + ] + else: + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[ + dim : dim * 2 + ] + orig_state_dict[f"vision_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] else: - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] + layer_num = key_split[2] + dim = config.text_config.hidden_size + if "weight" in key: + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ + dim : dim * 2, : + ] + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] + else: + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[ + dim : dim * 2 + ] + orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] + else: - new_key_name = rename_key(key) - print("New key:", new_key_name) orig_state_dict[rename_key(key)] = val return orig_state_dict @@ -89,9 +145,9 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat model = XClipModel(config) model.eval() - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)['model'] + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] state_dict = convert_state_dict(state_dict, config) - + model = XClipModel(config) model.load_state_dict(state_dict) @@ -133,4 +189,4 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat ) args = parser.parse_args() - convert_xclip_checkpoint(args.checkpoint_url, args.model_name, args.pytorch_dump_folder_path) \ No newline at end of file + convert_xclip_checkpoint(args.checkpoint_url, args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 9ead750c34208..77ceaea557506 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -45,7 +45,6 @@ ] - # Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ @@ -388,17 +387,18 @@ class XClipVisionEncoderLayer(nn.Module): """ This corresponds to the `CrossFramelAttentionBlock` class in the original implementation. """ + def __init__(self, config: XClipConfig): super().__init__() self.T = config.num_frames - + self.embed_dim = config.hidden_size self.message_fc = nn.Linear(self.embed_dim, self.embed_dim) self.message_ln = nn.LayerNorm(self.embed_dim) self.message_attn = XClipAttention(config) - - self.drop_path = XClipDropPath(config.drop_path_rate) if config.drop_path_rate > 0. 
else nn.Identity() + + self.drop_path = XClipDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() self.self_attn = XClipAttention(config) self.layer_norm1 = nn.LayerNorm(self.embed_dim) @@ -424,17 +424,21 @@ def forward( """ l, bt, d = hidden_states.size() b = bt // self.T - x = x.view(l, b, self.T, d) - - msg_token = self.message_fc(hidden_states[0,:,:,:]) - msg_token = msg_token.view(b, self.T, 1, d) - - msg_token = msg_token.permute(1,2,0,3).view(self.T, b, d) - msg_token = msg_token + self.drop_path(self.message_attn(self.message_ln(msg_token), self.message_ln(msg_token),self.message_ln(msg_token),need_weights=False)[0]) - msg_token = msg_token.view(self.T, 1, b, d).permute(1,2,0,3) - + x = x.view(l, b, self.T, d) + + msg_token = self.message_fc(hidden_states[0, :, :, :]) + msg_token = msg_token.view(b, self.T, 1, d) + + msg_token = msg_token.permute(1, 2, 0, 3).view(self.T, b, d) + msg_token = msg_token + self.drop_path( + self.message_attn( + self.message_ln(msg_token), self.message_ln(msg_token), self.message_ln(msg_token), need_weights=False + )[0] + ) + msg_token = msg_token.view(self.T, 1, b, d).permute(1, 2, 0, 3) + hidden_states = torch.cat([hidden_states, msg_token], dim=0) - + residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -971,7 +975,7 @@ def __init__(self, config: XClipVisionConfig): embed_dim = config.hidden_size self.embeddings = XClipVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm(embed_dim) + self.pre_layernorm = nn.LayerNorm(embed_dim) self.encoder = XClipVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim) @@ -998,7 +1002,7 @@ def forward( raise ValueError("You have to specify pixel_values") hidden_states = self.embeddings(pixel_values) - hidden_states = self.pre_layrnorm(hidden_states) + hidden_states = self.pre_layernorm(hidden_states) encoder_outputs = self.encoder( inputs_embeds=hidden_states, @@ -1103,10 +1107,14 @@ def __init__(self, config: XClipConfig): self.text_model = XClipTextTransformer(text_config) self.vision_model = XClipVisionTransformer(vision_config) + self.final_layernorm = nn.LayerNorm(text_config.hidden_size) self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value) + self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim) + self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.text_embed_dim)) + # Initialize weights and apply final processing self.post_init() @@ -1196,7 +1204,7 @@ def get_video_features( batch_size, num_frames, num_channels, height, width = pixel_values.shape image = image.reshape(-1, num_channels, height, width) - + vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, @@ -1210,10 +1218,10 @@ def get_video_features( # TODO add the following: # img_features = self.prompts_visual_ln(img_features) # img_features = img_features @ self.prompts_visual_proj - + # cls_features = cls_features.view(b, t, -1) # img_features = img_features.view(b,t,-1,cls_features.shape[-1]) - + # video_features = self.mit(cls_features) return image_features diff --git a/src/transformers/models/x_clip/test.py b/src/transformers/models/x_clip/test.py new file mode 100644 index 0000000000000..62728d65c1e9e --- /dev/null +++ b/src/transformers/models/x_clip/test.py @@ -0,0 +1,7 @@ +from 
transformers import XClipConfig, XClipModel + +config = XClipConfig() +model = XClipModel(config) + +for name, param in model.named_parameters(): + print(name, param.shape) \ No newline at end of file diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 1c7dd31f3fc4a..63716aecaa25f 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -159,7 +159,9 @@ class XClipVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = XClipVisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=XClipVisionConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester( + self, config_class=XClipVisionConfig, has_text_modality=False, hidden_size=37 + ) def test_config(self): self.config_tester.run_common_tests() From 2fa856a6f5dec5b2db46da0f0f910df2a99ba826 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 11 Aug 2022 14:10:12 +0000 Subject: [PATCH 03/40] Make vision encoder work --- .../convert_x_clip_original_pytorch_to_hf.py | 46 ++++++++++------ .../models/x_clip/modeling_x_clip.py | 55 ++++++++++++------- src/transformers/models/x_clip/test.py | 3 +- 3 files changed, 66 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 6c76deb4c8447..6b266e9f177aa 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -18,7 +18,7 @@ import torch from huggingface_hub import hf_hub_download -from transformers import AutoFeatureExtractor, XClipConfig, XClipModel +from transformers import AutoTokenizer, XClipConfig, XClipModel def get_xclip_config(model_name): @@ -44,6 +44,8 @@ def rename_key(name): name = name.replace("transformer.resblocks", "text_model.encoder.layers") if "attn.out_proj" in name and "message" not in name: name = name.replace("attn.out_proj", "self_attn.out_proj") + if "ln_final" in name: + name = name.replace("ln_final", "text_model.final_layer_norm") # visual encoder if name == "visual.class_embedding": name = name.replace("visual.class_embedding", "vision_model.embeddings.class_embedding") @@ -57,11 +59,11 @@ def rename_key(name): name = name.replace("visual.ln_pre", "vision_model.pre_layernorm") if "visual.ln_post" in name: name = name.replace("visual.ln_post", "vision_model.post_layernorm") - # things on top - if "ln_final" in name: - name = name.replace("ln_final", "final_layernorm") if "visual.proj" in name: - name = name.replace("visual.proj", "visual_projection") + name = name.replace("visual.proj", "visual_projection.weight") + if "text_projection" in name: + name = name.replace("text_projection", "text_projection.weight") + # things on top if "prompts_visual_proj" in name: name = name.replace("prompts_visual_proj", "prompts_visual_projection") if "prompts_visual_ln" in name: @@ -134,8 +136,14 @@ def convert_state_dict(orig_state_dict, config): ] orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] + elif key.startswith("prompts_generator") or key.startswith("mit"): + # TODO + pass else: - orig_state_dict[rename_key(key)] = val + new_key_name = rename_key(key) + if new_key_name in ["visual_projection.weight", "text_projection.weight"]: + val = val.T + orig_state_dict[new_key_name] = val return orig_state_dict @@ 
-149,24 +157,30 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat state_dict = convert_state_dict(state_dict, config) model = XClipModel(config) - model.load_state_dict(state_dict) + missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) + assert missing_keys == ["text_model.embeddings.position_ids", "vision_model.embeddings.position_ids"] + model.eval() - # url = "http://images.cocodataset.org/val2017/000000039769.jpg" + file_path = hf_hub_download( + repo_id="hf-internal-testing/spaghetti-video-8-frames", filename="pixel_values.pt", repo_type="dataset" + ) + pixel_values = torch.load(file_path) # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) - # image = Image.open(requests.get(url, stream=True).raw) # inputs = feature_extractor(images=image, return_tensors="pt") - # timm_outs = timm_model(inputs["pixel_values"]) - # hf_outs = model(**inputs).logits + tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + input_ids = tokenizer(["playing sports", "eating spaghetti", "go shopping"], return_tensors="pt").input_ids - # assert torch.allclose(timm_outs, hf_outs, atol=1e-3) + with torch.no_grad(): + outputs = model(input_ids=input_ids, pixel_values=pixel_values) - # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - # model.save_pretrained(pytorch_dump_folder_path) + # TODO verify outputs + print(outputs.logits_per_image) - # print(f"Saving feature extractor to {pytorch_dump_folder_path}") - # feature_extractor.save_pretrained(pytorch_dump_folder_path) + if pytorch_dump_folder_path is not None: + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 77ceaea557506..006fa1be2150e 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -422,22 +422,25 @@ def forward( Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
""" - l, bt, d = hidden_states.size() + bt, l, d = hidden_states.size() b = bt // self.T - x = x.view(l, b, self.T, d) + msg_token = self.message_fc(hidden_states[:, 0, :]) + msg_token = msg_token.view(b, self.T, d) - msg_token = self.message_fc(hidden_states[0, :, :, :]) - msg_token = msg_token.view(b, self.T, 1, d) + print("Shape of msg_token:", msg_token.shape) + print("Initial values of msg_token:", msg_token[0, :3, :3]) - msg_token = msg_token.permute(1, 2, 0, 3).view(self.T, b, d) - msg_token = msg_token + self.drop_path( - self.message_attn( - self.message_ln(msg_token), self.message_ln(msg_token), self.message_ln(msg_token), need_weights=False - )[0] - ) - msg_token = msg_token.view(self.T, 1, b, d).permute(1, 2, 0, 3) + msg_token = msg_token + self.drop_path(self.message_attn(self.message_ln(msg_token))[0]) + # add dummy sequence dimension + msg_token = msg_token.view(-1, 1, d) + + print("Shape of msg_token after self-attention:", msg_token.shape) + print("Initial values of msg_token after self-attention:", msg_token[0, :3, :3]) + + hidden_states = torch.cat([hidden_states, msg_token], dim=1) - hidden_states = torch.cat([hidden_states, msg_token], dim=0) + print("Shape of hidden states after concatentation:", hidden_states.shape) + print("Initial values of hidden states after concatenation:", hidden_states[0, :3, :3]) residual = hidden_states @@ -925,6 +928,10 @@ def forward( hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): + if idx == 0: + print(f"Shape of hidden states before layer {idx}:", hidden_states.shape) + print(f"Initial values of hidden states before layer {idx}:", hidden_states[0, :3, :3]) + if output_hidden_states: encoder_states = encoder_states + (hidden_states,) if self.gradient_checkpointing and self.training: @@ -951,6 +958,10 @@ def custom_forward(*inputs): hidden_states = layer_outputs[0] + if idx == 0: + print(f"Shape of hidden states after layer {idx}:", hidden_states.shape) + print(f"Initial values of hidden states after layer {idx}:", hidden_states[0, :3, :3]) + if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) @@ -983,7 +994,7 @@ def __init__(self, config: XClipVisionConfig): @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XClipVisionConfig) def forward( self, - pixel_values: Optional[torch.FloatTensor] = None, + pixel_values: torch.FloatTensor, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -998,12 +1009,12 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) hidden_states = self.pre_layernorm(hidden_states) + print("Initial hidden states:", hidden_states.shape) + print("Initial values of hidden states:", hidden_states[0, :3, :3]) + encoder_outputs = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, @@ -1107,7 +1118,6 @@ def __init__(self, config: XClipConfig): self.text_model = XClipTextTransformer(text_config) self.vision_model = XClipVisionTransformer(vision_config) - self.final_layernorm = nn.LayerNorm(text_config.hidden_size) self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) self.logit_scale = nn.Parameter(torch.ones([]) * 
self.config.logit_scale_init_value) @@ -1203,7 +1213,7 @@ def get_video_features( return_dict = return_dict if return_dict is not None else self.config.use_return_dict batch_size, num_frames, num_channels, height, width = pixel_values.shape - image = image.reshape(-1, num_channels, height, width) + pixel_values = pixel_values.reshape(-1, num_channels, height, width) vision_outputs = self.vision_model( pixel_values=pixel_values, @@ -1270,6 +1280,9 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + batch_size, num_frames, num_channels, height, width = pixel_values.shape + pixel_values = pixel_values.reshape(-1, num_channels, height, width) + vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, @@ -1291,7 +1304,7 @@ def forward( text_embeds = text_outputs[1] text_embeds = self.text_projection(text_embeds) - + # normalized features image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) @@ -1306,7 +1319,7 @@ def forward( loss = x_clip_loss(logits_per_text) if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_embeds, vision_outputs) return ((loss,) + output) if loss is not None else output return XClipOutput( @@ -1315,6 +1328,6 @@ def forward( logits_per_text=logits_per_text, text_embeds=text_embeds, image_embeds=image_embeds, - text_model_output=text_outputs, + text_model_output=text_embeds, vision_model_output=vision_outputs, ) diff --git a/src/transformers/models/x_clip/test.py b/src/transformers/models/x_clip/test.py index 62728d65c1e9e..b923826031edf 100644 --- a/src/transformers/models/x_clip/test.py +++ b/src/transformers/models/x_clip/test.py @@ -1,7 +1,8 @@ from transformers import XClipConfig, XClipModel + config = XClipConfig() model = XClipModel(config) for name, param in model.named_parameters(): - print(name, param.shape) \ No newline at end of file + print(name, param.shape) From 51c4c5aca0f4007587ecad815a886ef7a0d8bbba Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 12 Aug 2022 17:15:13 +0000 Subject: [PATCH 04/40] More improvements --- .../convert_x_clip_original_pytorch_to_hf.py | 4 +- .../models/x_clip/modeling_x_clip.py | 39 +++++++++---------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 6b266e9f177aa..b45ab4ba226ef 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -170,13 +170,13 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat # inputs = feature_extractor(images=image, return_tensors="pt") tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") - input_ids = tokenizer(["playing sports", "eating spaghetti", "go shopping"], return_tensors="pt").input_ids + input_ids = tokenizer(["playing sports", "eating spaghetti", "go shopping"], padding="max_length", return_tensors="pt").input_ids with torch.no_grad(): outputs = model(input_ids=input_ids, pixel_values=pixel_values) # TODO verify outputs - print(outputs.logits_per_image) + print(outputs.keys()) if pytorch_dump_folder_path is not None: 
print(f"Saving model {model_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 006fa1be2150e..378fb4d2120c3 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -422,26 +422,19 @@ def forward( Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. """ + print("Initial values of hidden states:", hidden_states[0, :3, :3]) + bt, l, d = hidden_states.size() b = bt // self.T msg_token = self.message_fc(hidden_states[:, 0, :]) msg_token = msg_token.view(b, self.T, d) - print("Shape of msg_token:", msg_token.shape) - print("Initial values of msg_token:", msg_token[0, :3, :3]) - msg_token = msg_token + self.drop_path(self.message_attn(self.message_ln(msg_token))[0]) # add dummy sequence dimension msg_token = msg_token.view(-1, 1, d) - print("Shape of msg_token after self-attention:", msg_token.shape) - print("Initial values of msg_token after self-attention:", msg_token[0, :3, :3]) - hidden_states = torch.cat([hidden_states, msg_token], dim=1) - print("Shape of hidden states after concatentation:", hidden_states.shape) - print("Initial values of hidden states after concatenation:", hidden_states[0, :3, :3]) - residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -453,11 +446,15 @@ def forward( ) hidden_states = residual + hidden_states + hidden_states = hidden_states[:, :l, :] + residual = hidden_states hidden_states = self.layer_norm2(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states + print("Initial values of hidden states after residual:", hidden_states[0, :3, :3]) + outputs = (hidden_states,) if output_attentions: @@ -775,6 +772,8 @@ def forward( # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + print("Initial hidden states of the Text encoder:", hidden_states[0, :3, :3]) + encoder_outputs = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, @@ -791,6 +790,8 @@ def forward( # take features from the eot embedding (eot_token is the highest number in each sequence) pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] + print("Initial values of the text pooled output:", pooled_output[0,:3]) + if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] @@ -928,9 +929,7 @@ def forward( hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): - if idx == 0: - print(f"Shape of hidden states before layer {idx}:", hidden_states.shape) - print(f"Initial values of hidden states before layer {idx}:", hidden_states[0, :3, :3]) + print("---------LAYER ---------", idx) if output_hidden_states: encoder_states = encoder_states + (hidden_states,) @@ -958,10 +957,6 @@ def custom_forward(*inputs): hidden_states = layer_outputs[0] - if idx == 0: - print(f"Shape of hidden states after layer {idx}:", hidden_states.shape) - print(f"Initial values of hidden states after layer {idx}:", hidden_states[0, :3, :3]) - if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) @@ -1012,9 +1007,6 @@ def forward( hidden_states = self.embeddings(pixel_values) hidden_states = self.pre_layernorm(hidden_states) - print("Initial hidden states:", hidden_states.shape) - print("Initial values of hidden states:", 
hidden_states[0, :3, :3]) - encoder_outputs = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, @@ -1026,6 +1018,9 @@ def forward( pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) + print("Shape of pooled output:", pooled_output.shape) + print("Initial values of pooled output:", pooled_output[0, :3]) + if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] @@ -1282,7 +1277,7 @@ def forward( batch_size, num_frames, num_channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(-1, num_channels, height, width) - + vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, @@ -1290,6 +1285,8 @@ def forward( return_dict=return_dict, ) + print("Shape of input_ids:", input_ids.shape) + text_outputs = self.text_model( input_ids=input_ids, attention_mask=attention_mask, @@ -1304,7 +1301,7 @@ def forward( text_embeds = text_outputs[1] text_embeds = self.text_projection(text_embeds) - + # normalized features image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) From 8fdc4a1bc2dc7b8e82481cba6a98bcaaf4296ff1 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 23 Aug 2022 09:12:02 +0000 Subject: [PATCH 05/40] Improve conversion script --- .../models/x_clip/configuration_x_clip.py | 5 +++-- .../convert_x_clip_original_pytorch_to_hf.py | 7 ++++++- .../models/x_clip/modeling_x_clip.py | 19 +++++++++++++------ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 7a0f014796805..792835f1a627d 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -163,8 +163,9 @@ class XClipVisionConfig(PretrainedConfig): The size (resolution) of each patch. hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, - defaults to 1e-5): The epsilon used by the layer normalization layers. + `"relu"`, `"selu"`, `"gelu_new"` and ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. dropout (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. 
attention_dropout (`float`, *optional*, defaults to 0.0): diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index b45ab4ba226ef..f7326df8ffdf6 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -120,6 +120,9 @@ def convert_state_dict(orig_state_dict, config): dim : dim * 2 ] orig_state_dict[f"vision_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] + elif "mit" in key: + # TODO: multihead self-attention of MIT + pass else: layer_num = key_split[2] dim = config.text_config.hidden_size @@ -170,7 +173,9 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat # inputs = feature_extractor(images=image, return_tensors="pt") tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") - input_ids = tokenizer(["playing sports", "eating spaghetti", "go shopping"], padding="max_length", return_tensors="pt").input_ids + input_ids = tokenizer( + ["playing sports", "eating spaghetti", "go shopping"], padding="max_length", return_tensors="pt" + ).input_ids with torch.no_grad(): outputs = model(input_ids=input_ids, pixel_values=pixel_values) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 378fb4d2120c3..5497328b4d06b 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -312,6 +312,7 @@ def forward( attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, output_attentions: Optional[bool] = False, + print_values=False, ) -> Tuple[torch.FloatTensor]: """ Args: @@ -326,12 +327,14 @@ def forward( residual = hidden_states hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, ) + hidden_states = residual + hidden_states residual = hidden_states @@ -706,6 +709,7 @@ def custom_forward(*inputs): attention_mask, causal_attention_mask, output_attentions=output_attentions, + print_values=idx == 0, ) hidden_states = layer_outputs[0] @@ -786,12 +790,12 @@ def forward( last_hidden_state = encoder_outputs[0] last_hidden_state = self.final_layer_norm(last_hidden_state) + print("Initial values of the text final hidden states:", last_hidden_state[0, :3, :3]) + # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] - print("Initial values of the text pooled output:", pooled_output[0,:3]) - if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] @@ -1018,9 +1022,6 @@ def forward( pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - print("Shape of pooled output:", pooled_output.shape) - print("Initial values of pooled output:", pooled_output[0, :3]) - if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] @@ -1220,6 +1221,8 @@ def get_video_features( pooled_output = vision_outputs[1] # pooled_output image_features = self.visual_projection(pooled_output) + print("Shape of image features:", image_features.shape) + # 
TODO add the following: # img_features = self.prompts_visual_ln(img_features) # img_features = img_features @ self.prompts_visual_proj @@ -1285,7 +1288,8 @@ def forward( return_dict=return_dict, ) - print("Shape of input_ids:", input_ids.shape) + # TODO remove this assertion (vision pooler output) + assert torch.allclose(vision_outputs.pooler_output[0, :3], torch.tensor([-0.2987, 1.0489, 0.3702]), atol=1e-4) text_outputs = self.text_model( input_ids=input_ids, @@ -1302,6 +1306,9 @@ def forward( text_embeds = text_outputs[1] text_embeds = self.text_projection(text_embeds) + # TODO remove this assertion (text pooler output) + assert torch.allclose(text_embeds[0, :3], torch.tensor([-0.2870, -0.3504, 0.0417]), atol=1e-4) + # normalized features image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) From 1bcbedce19d9127f86dd10bb8a06e6cc1c3ab128 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 23 Aug 2022 09:20:40 +0000 Subject: [PATCH 06/40] Fix quality --- src/transformers/models/x_clip/modeling_x_clip.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 5497328b4d06b..10875ea58401d 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -312,7 +312,6 @@ def forward( attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, output_attentions: Optional[bool] = False, - print_values=False, ) -> Tuple[torch.FloatTensor]: """ Args: @@ -327,14 +326,12 @@ def forward( residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, ) - hidden_states = residual + hidden_states residual = hidden_states @@ -709,7 +706,6 @@ def custom_forward(*inputs): attention_mask, causal_attention_mask, output_attentions=output_attentions, - print_values=idx == 0, ) hidden_states = layer_outputs[0] @@ -1308,6 +1304,7 @@ def forward( # TODO remove this assertion (text pooler output) assert torch.allclose(text_embeds[0, :3], torch.tensor([-0.2870, -0.3504, 0.0417]), atol=1e-4) + print("Looks ok!") # normalized features image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) From c6c29d15ec4ae7398588241ab917cd8d0347aabe Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 23 Aug 2022 10:05:26 +0000 Subject: [PATCH 07/40] Add MultiframeIntegrationTransformer --- .../models/x_clip/configuration_x_clip.py | 16 +++++++ .../convert_x_clip_original_pytorch_to_hf.py | 25 ++++++++-- .../models/x_clip/modeling_x_clip.py | 47 +++++++++++++++++++ 3 files changed, 84 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 792835f1a627d..9bf9c20927d44 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -157,6 +157,14 @@ class XClipVisionConfig(PretrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. 
+ mit_hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers of the Multiframe Integration Transformer (MIT). + mit_intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Multiframe Integration Transformer (MIT). + mit_num_hidden_layers (`int`, *optional*, defaults to 1): + Number of hidden layers in the Multiframe Integration Transformer (MIT). + mit_num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Multiframe Integration Transformer (MIT). image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 32): @@ -201,6 +209,10 @@ def __init__( intermediate_size=3072, num_hidden_layers=12, num_attention_heads=12, + mit_hidden_size=512, + mit_intermediate_size=2048, + mit_num_hidden_layers=1, + mit_num_attention_heads=8, num_channels=3, image_size=224, patch_size=32, @@ -221,6 +233,10 @@ def __init__( self.dropout = dropout self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads + self.mit_hidden_size = mit_hidden_size + self.mit_intermediate_size = mit_intermediate_size + self.mit_num_hidden_layers = mit_num_hidden_layers + self.mit_num_attention_heads = mit_num_attention_heads self.num_channels = num_channels self.patch_size = patch_size self.num_frames = num_frames diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index f7326df8ffdf6..7056f076ac332 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -63,6 +63,11 @@ def rename_key(name): name = name.replace("visual.proj", "visual_projection.weight") if "text_projection" in name: name = name.replace("text_projection", "text_projection.weight") + # mit + if name == "mit.positional_embedding": + name = name.replace("positional", "position") + if name.startswith("mit.resblocks"): + name = name.replace("mit.resblocks", "mit.encoder.layers") # things on top if "prompts_visual_proj" in name: name = name.replace("prompts_visual_proj", "prompts_visual_projection") @@ -120,9 +125,21 @@ def convert_state_dict(orig_state_dict, config): dim : dim * 2 ] orig_state_dict[f"vision_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - elif "mit" in key: - # TODO: multihead self-attention of MIT - pass + elif key.startswith("mit"): + layer_num = key_split[2] + dim = config.vision_config.mit_hidden_size + if "weight" in key: + orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] + orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ + dim : dim * 2, : + ] + orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] + else: + orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] + orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[ + dim : dim * 2 + ] + orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] else: layer_num = key_split[2] dim = config.text_config.hidden_size @@ -139,7 +156,7 @@ def convert_state_dict(orig_state_dict, config): ] orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - elif 
key.startswith("prompts_generator") or key.startswith("mit"): + elif key.startswith("prompts_generator"): # TODO pass else: diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 10875ea58401d..e42fbb6f83aae 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -15,6 +15,7 @@ """ PyTorch X-CLIP model.""" +from copy import copy from dataclasses import dataclass from typing import Any, Optional, Tuple, Union @@ -1081,6 +1082,45 @@ def forward( ) +class XClipMultiframeIntegrationTransformer(nn.Module): + """ + This corresponds to the `MultiframeIntegrationTransformer` class in the original implementation. + """ + + def __init__(self, config: XClipVisionConfig): + super().__init__() + + self.position_embedding = nn.Parameter(torch.empty(1, config.num_frames, config.hidden_size)) + self.encoder = XClipEncoder(config) + + def forward( + self, + hidden_states, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + residual = hidden_states + + # add position embeddings + hidden_states = hidden_states + self.position_embedding + + # TODO support output hidden states and/or attentions + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_state = encoder_outputs[0] + + last_hidden_state = last_hidden_state.type(hidden_states.dtype) + hidden_states + + return last_hidden_state.mean(dim=1, keepdim=False) + + @add_start_docstrings(X_CLIP_START_DOCSTRING) class XClipModel(XClipPreTrainedModel): config_class = XClipConfig @@ -1117,6 +1157,13 @@ def __init__(self, config: XClipConfig): self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim) self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.text_embed_dim)) + mit_config = copy(vision_config) + mit_config.hidden_size = vision_config.mit_hidden_size + mit_config.intermediate_size = vision_config.mit_intermediate_size + mit_config.num_hidden_layers = vision_config.mit_num_hidden_layers + mit_config.num_attention_heads = vision_config.mit_num_attention_heads + self.mit = XClipMultiframeIntegrationTransformer(mit_config) + # Initialize weights and apply final processing self.post_init() From c324f2d9af90bf85a43660830128ade5b936c5f0 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 23 Aug 2022 10:12:45 +0000 Subject: [PATCH 08/40] More improvements --- src/transformers/models/x_clip/configuration_x_clip.py | 3 ++- .../x_clip/convert_x_clip_original_pytorch_to_hf.py | 8 ++------ src/transformers/models/x_clip/modeling_x_clip.py | 4 +--- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 9bf9c20927d44..ad48e06f5f5cd 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -160,7 +160,8 @@ class XClipVisionConfig(PretrainedConfig): mit_hidden_size (`int`, *optional*, defaults to 512): Dimensionality of the encoder layers of the Multiframe Integration Transformer (MIT). 
mit_intermediate_size (`int`, *optional*, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Multiframe Integration Transformer (MIT). + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Multiframe Integration Transformer + (MIT). mit_num_hidden_layers (`int`, *optional*, defaults to 1): Number of hidden layers in the Multiframe Integration Transformer (MIT). mit_num_attention_heads (`int`, *optional*, defaults to 8): diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 7056f076ac332..a6e1d82786ce6 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -130,15 +130,11 @@ def convert_state_dict(orig_state_dict, config): dim = config.vision_config.mit_hidden_size if "weight" in key: orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ - dim : dim * 2, : - ] + orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[dim : dim * 2, :] orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] else: orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[ - dim : dim * 2 - ] + orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] orig_state_dict[f"mit.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] else: layer_num = key_split[2] diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index e42fbb6f83aae..fcee198a3be5a 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -70,7 +70,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->x_clip def x_clip_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.T) + image_loss = contrastive_loss(similarity.t()) return (caption_loss + image_loss) / 2.0 @@ -1102,8 +1102,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: - residual = hidden_states - # add position embeddings hidden_states = hidden_states + self.position_embedding From 9384c4f5548d36253d4471b99cd629db20502d8f Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 23 Aug 2022 10:53:23 +0000 Subject: [PATCH 09/40] Make MiT output work --- .../models/x_clip/modeling_x_clip.py | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index fcee198a3be5a..ccfc63a844c63 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -423,8 +423,7 @@ def forward( Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
""" - print("Initial values of hidden states:", hidden_states[0, :3, :3]) - + # TODO improve variable names bt, l, d = hidden_states.size() b = bt // self.T msg_token = self.message_fc(hidden_states[:, 0, :]) @@ -454,8 +453,6 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - print("Initial values of hidden states after residual:", hidden_states[0, :3, :3]) - outputs = (hidden_states,) if output_attentions: @@ -773,8 +770,6 @@ def forward( # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] attention_mask = _expand_mask(attention_mask, hidden_states.dtype) - print("Initial hidden states of the Text encoder:", hidden_states[0, :3, :3]) - encoder_outputs = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, @@ -787,8 +782,6 @@ def forward( last_hidden_state = encoder_outputs[0] last_hidden_state = self.final_layer_norm(last_hidden_state) - print("Initial values of the text final hidden states:", last_hidden_state[0, :3, :3]) - # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] @@ -930,8 +923,6 @@ def forward( hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): - print("---------LAYER ---------", idx) - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) if self.gradient_checkpointing and self.training: @@ -1102,9 +1093,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: + residual = hidden_states + # add position embeddings hidden_states = hidden_states + self.position_embedding + print("Hidden states after position embedding:", hidden_states[0,:3,:3]) + # TODO support output hidden states and/or attentions encoder_outputs = self.encoder( inputs_embeds=hidden_states, @@ -1114,7 +1109,7 @@ def forward( ) last_hidden_state = encoder_outputs[0] - last_hidden_state = last_hidden_state.type(hidden_states.dtype) + hidden_states + last_hidden_state = last_hidden_state.type(hidden_states.dtype) + residual return last_hidden_state.mean(dim=1, keepdim=False) @@ -1262,8 +1257,6 @@ def get_video_features( pooled_output = vision_outputs[1] # pooled_output image_features = self.visual_projection(pooled_output) - print("Shape of image features:", image_features.shape) - # TODO add the following: # img_features = self.prompts_visual_ln(img_features) # img_features = img_features @ self.prompts_visual_proj @@ -1332,6 +1325,24 @@ def forward( # TODO remove this assertion (vision pooler output) assert torch.allclose(vision_outputs.pooler_output[0, :3], torch.tensor([-0.2987, 1.0489, 0.3702]), atol=1e-4) + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + cls_features = image_embeds.view(batch_size, num_frames, -1) + + print("Shape of MIT input:", cls_features.shape) + print("Initial values of MIT input:", cls_features[0,:3,:3]) + + image_embeds = self.mit(cls_features) + + print("Shape of output of MIT:", image_embeds.shape) + print("First values of output of MIT:", image_embeds[0,:3]) + + img_features = vision_outputs[0][:, 1:, :] + img_features = self.prompts_visual_layernorm(img_features) + img_features = img_features @ self.prompts_visual_projection + img_features = img_features.view(batch_size, num_frames, -1, image_embeds.shape[-1]) + text_outputs = 
self.text_model( input_ids=input_ids, attention_mask=attention_mask, @@ -1340,10 +1351,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - - image_embeds = vision_outputs[1] - image_embeds = self.visual_projection(image_embeds) - + text_embeds = text_outputs[1] text_embeds = self.text_projection(text_embeds) From d679bd09861eb21e43459c0833dcf3261458ec79 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 23 Aug 2022 10:56:10 +0000 Subject: [PATCH 10/40] Fix quality --- src/transformers/models/x_clip/modeling_x_clip.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index ccfc63a844c63..2f8156a51712a 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -1098,8 +1098,6 @@ def forward( # add position embeddings hidden_states = hidden_states + self.position_embedding - print("Hidden states after position embedding:", hidden_states[0,:3,:3]) - # TODO support output hidden states and/or attentions encoder_outputs = self.encoder( inputs_embeds=hidden_states, @@ -1331,12 +1329,12 @@ def forward( cls_features = image_embeds.view(batch_size, num_frames, -1) print("Shape of MIT input:", cls_features.shape) - print("Initial values of MIT input:", cls_features[0,:3,:3]) + print("Initial values of MIT input:", cls_features[0, :3, :3]) image_embeds = self.mit(cls_features) print("Shape of output of MIT:", image_embeds.shape) - print("First values of output of MIT:", image_embeds[0,:3]) + print("First values of output of MIT:", image_embeds[0, :3]) img_features = vision_outputs[0][:, 1:, :] img_features = self.prompts_visual_layernorm(img_features) @@ -1351,7 +1349,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - + text_embeds = text_outputs[1] text_embeds = self.text_projection(text_embeds) From 533c4e00b220a034f4b7e517e55fe42f24d7a3d7 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 23 Aug 2022 13:28:17 +0000 Subject: [PATCH 11/40] Add prompts generator --- .../models/x_clip/configuration_x_clip.py | 32 ++++++- .../convert_x_clip_original_pytorch_to_hf.py | 26 ++--- .../models/x_clip/modeling_x_clip.py | 96 +++++++++++++++++++ 3 files changed, 139 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index ad48e06f5f5cd..18ecde6c87d81 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -57,8 +57,9 @@ class XClipTextConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, - defaults to 1e-5): The epsilon used by the layer normalization layers. + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. 
dropout (`float`, *optional*, defaults to 0.0): @@ -282,8 +283,21 @@ class XClipConfig(PretrainedConfig): Dictionary of configuration options used to initialize [`XClipVisionConfig`]. projection_dim (`int`, *optional*, defaults to 512): Dimentionality of text and vision projection layers. + prompt_layers (`int`, *optional*, defaults to 2): + Number of layers in the video specific prompt generator. + prompt_alpha (`float`, *optional*, defaults to 0.1): + Alpha value to use in the video specific prompt generator. + prompt_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the video specific prompt generator. If string, + `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + prompt_num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads in the cross-attention of the video specific prompt generator. + prompt_attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for the attention layers in the video specific prompt generator. + prompt_projection_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for the projection layers in the video specific prompt generator. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The inital value of the *logit_scale* paramter. Default is used as per the original XClip implementation. + The inital value of the *logit_scale* parameter. Default is used as per the original XClip implementation. kwargs (*optional*): Dictionary of keyword arguments. """ @@ -296,6 +310,12 @@ def __init__( text_config_dict=None, vision_config_dict=None, projection_dim=512, + prompt_layers=2, + prompt_alpha=0.1, + prompt_hidden_act="quick_gelu", + prompt_num_attention_heads=8, + prompt_attention_dropout=0.0, + prompt_projection_dropout=0.0, logit_scale_init_value=2.6592, **kwargs ): @@ -313,6 +333,12 @@ def __init__( self.vision_config = XClipVisionConfig(**vision_config_dict) self.projection_dim = projection_dim + self.prompt_layers = prompt_layers + self.prompt_alpha = prompt_alpha + self.prompt_hidden_act = prompt_hidden_act + self.prompt_num_attention_heads = prompt_num_attention_heads + self.prompt_attention_dropout = prompt_attention_dropout + self.prompt_projection_dropout = prompt_projection_dropout self.logit_scale_init_value = logit_scale_init_value self.initializer_factor = 1.0 diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index a6e1d82786ce6..4cda3a4614fa4 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -63,17 +63,19 @@ def rename_key(name): name = name.replace("visual.proj", "visual_projection.weight") if "text_projection" in name: name = name.replace("text_projection", "text_projection.weight") - # mit - if name == "mit.positional_embedding": - name = name.replace("positional", "position") - if name.startswith("mit.resblocks"): - name = name.replace("mit.resblocks", "mit.encoder.layers") # things on top if "prompts_visual_proj" in name: name = name.replace("prompts_visual_proj", "prompts_visual_projection") if "prompts_visual_ln" in name: name = name.replace("prompts_visual_ln", "prompts_visual_layernorm") - # TODO: prompts generator, mit + # mit + if name == "mit.positional_embedding": + name = name.replace("positional", "position") + if 
name.startswith("mit.resblocks"): + name = name.replace("mit.resblocks", "mit.encoder.layers") + # prompts generator + if name.startswith("prompts_generator.norm"): + name = name.replace("prompts_generator.norm", "prompts_generator.layernorm") return name @@ -151,10 +153,6 @@ def convert_state_dict(orig_state_dict, config): dim : dim * 2 ] orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - - elif key.startswith("prompts_generator"): - # TODO - pass else: new_key_name = rename_key(key) if new_key_name in ["visual_projection.weight", "text_projection.weight"]: @@ -193,8 +191,12 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat with torch.no_grad(): outputs = model(input_ids=input_ids, pixel_values=pixel_values) - # TODO verify outputs - print(outputs.keys()) + # Verify outputs + logits_per_image = outputs.logits_per_image + probs = logits_per_image.softmax(dim=1) + expected_probs = torch.tensor([[[0.0019], [0.9951], [0.0030]]]) + assert torch.allclose(probs, expected_probs, atol=1e-3) + print("Looks ok!") if pytorch_dump_folder_path is not None: print(f"Saving model {model_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 2f8156a51712a..6dcbfdc3b53df 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -1112,6 +1112,96 @@ def forward( return last_hidden_state.mean(dim=1, keepdim=False) +class XClipCrossAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.num_heads = config.prompt_num_attention_heads + + dim = config.projection_dim + head_dim = dim // self.num_heads + self.scale = head_dim**-0.5 + + self.q_proj = nn.Linear(dim, dim, bias=False) + self.k_proj = nn.Linear(dim, dim, bias=False) + self.v_proj = nn.Linear(dim, dim, bias=False) + + self.attn_drop = nn.Dropout(config.prompt_attention_dropout) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(config.prompt_projection_dropout) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward(self, queries, keys, values): + """Input shape: Batch x Time x Channel""" + B, N, C = queries.shape + B, M, C = keys.shape + q = self.q_proj(queries).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + k = self.k_proj(keys).reshape(B, M, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + v = self.v_proj(values).reshape(B, M, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class PromptGeneratorLayer(nn.Module): + def __init__(self, config): + super().__init__() + + embed_dim = config.projection_dim + self.cross_attn = XClipCrossAttention(config) + self.norm1 = nn.LayerNorm(embed_dim) + self.norm3 = nn.LayerNorm(embed_dim) + self.mlp = nn.Sequential( + nn.Linear(embed_dim, embed_dim * 4), + ACT2FN[config.prompt_hidden_act], + nn.Dropout(config.prompt_attention_dropout), + nn.Linear(embed_dim * 4, embed_dim), + ) + + def forward(self, x, visual): + x = x + self.cross_attn(self.norm1(x), visual, visual) + x = x + 
self.mlp(self.norm3(x)) + return x + + +class XClipPromptGenerator(nn.Module): + """This corresponds to the `VideoSpecificPrompt` class in the original implementation.""" + + def __init__(self, config): + super().__init__() + embed_dim = config.projection_dim + self.layernorm = nn.LayerNorm(embed_dim) + self.decoder = nn.ModuleList([PromptGeneratorLayer(config) for _ in range(config.prompt_layers)]) + self.alpha = nn.Parameter(torch.ones(embed_dim) * config.prompt_alpha) + # self.apply(self._init_weights) + + # TODO do this in the init weights of XClipModel + # def _init_weights(self, m): + # if isinstance(m, nn.Linear): + # trunc_normal_(m.weight, std=.02) + # if isinstance(m, nn.Linear) and m.bias is not None: + # nn.init.constant_(m.bias, 0) + # elif isinstance(m, nn.LayerNorm): + # nn.init.constant_(m.bias, 0) + # nn.init.constant_(m.weight, 1.0) + + def forward(self, text, visual): + visual = self.layernorm(visual) + for layer in self.decoder: + text = layer(text, visual) + + return self.alpha * text + + @add_start_docstrings(X_CLIP_START_DOCSTRING) class XClipModel(XClipPreTrainedModel): config_class = XClipConfig @@ -1155,6 +1245,8 @@ def __init__(self, config: XClipConfig): mit_config.num_attention_heads = vision_config.mit_num_attention_heads self.mit = XClipMultiframeIntegrationTransformer(mit_config) + self.prompts_generator = XClipPromptGenerator(config) + # Initialize weights and apply final processing self.post_init() @@ -1340,6 +1432,7 @@ def forward( img_features = self.prompts_visual_layernorm(img_features) img_features = img_features @ self.prompts_visual_projection img_features = img_features.view(batch_size, num_frames, -1, image_embeds.shape[-1]) + img_features = img_features.mean(dim=1, keepdim=False) text_outputs = self.text_model( input_ids=input_ids, @@ -1357,6 +1450,9 @@ def forward( assert torch.allclose(text_embeds[0, :3], torch.tensor([-0.2870, -0.3504, 0.0417]), atol=1e-4) print("Looks ok!") + text_embeds = text_embeds.unsqueeze(0).expand(batch_size, -1, -1) + text_embeds = text_embeds + self.prompts_generator(text_embeds, img_features) + # normalized features image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) From beaae5ad0cbd8ab23dd0924c861c93c54d36668a Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 24 Aug 2022 08:38:17 +0000 Subject: [PATCH 12/40] Add tests --- .../models/x_clip/modeling_x_clip.py | 13 +- tests/models/x_clip/test_modeling_x_clip.py | 151 ++---------------- 2 files changed, 19 insertions(+), 145 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 6dcbfdc3b53df..2874a89bf0ebb 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -391,7 +391,7 @@ class XClipVisionEncoderLayer(nn.Module): def __init__(self, config: XClipConfig): super().__init__() - self.T = config.num_frames + self.num_frames = config.num_frames self.embed_dim = config.hidden_size @@ -423,15 +423,14 @@ def forward( Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
""" - # TODO improve variable names - bt, l, d = hidden_states.size() - b = bt // self.T + batch_time, seq_length, hidden_size = hidden_states.size() + batch_size = batch_time // self.num_frames msg_token = self.message_fc(hidden_states[:, 0, :]) - msg_token = msg_token.view(b, self.T, d) + msg_token = msg_token.view(batch_size, self.num_frames, hidden_size) msg_token = msg_token + self.drop_path(self.message_attn(self.message_ln(msg_token))[0]) # add dummy sequence dimension - msg_token = msg_token.view(-1, 1, d) + msg_token = msg_token.view(-1, 1, hidden_size) hidden_states = torch.cat([hidden_states, msg_token], dim=1) @@ -446,7 +445,7 @@ def forward( ) hidden_states = residual + hidden_states - hidden_states = hidden_states[:, :l, :] + hidden_states = hidden_states[:, :seq_length, :] residual = hidden_states hidden_states = self.layer_norm2(hidden_states) diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 63716aecaa25f..6204133babe70 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -23,16 +23,8 @@ import numpy as np import requests -import transformers from transformers import XClipConfig, XClipTextConfig, XClipVisionConfig -from transformers.testing_utils import ( - is_flax_available, - is_pt_flax_cross_test, - require_torch, - require_vision, - slow, - torch_device, -) +from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -59,14 +51,6 @@ from transformers import CLIPProcessor -if is_flax_available(): - import jax.numpy as jnp - from transformers.modeling_flax_pytorch_utils import ( - convert_pytorch_state_dict_to_flax, - load_flax_weights_in_pytorch_model, - ) - - class XClipVisionModelTester: def __init__( self, @@ -346,15 +330,25 @@ def test_model_from_pretrained(self): class XClipModelTester: - def __init__(self, parent, is_training=True): + def __init__(self, parent, num_frames=8, is_training=True): self.parent = parent + self.num_frames = num_frames self.text_model_tester = XClipTextModelTester(parent) self.vision_model_tester = XClipVisionModelTester(parent) self.is_training = is_training def prepare_config_and_inputs(self): text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + vision_config, _ = self.vision_model_tester.prepare_config_and_inputs() + pixel_values = floats_tensor( + [ + self.vision_model_tester.batch_size, + self.num_frames, + self.vision_model_tester.num_channels, + self.vision_model_tester.image_size, + self.vision_model_tester.image_size, + ] + ) config = self.get_config() @@ -510,125 +504,6 @@ def test_load_vision_text_config(self): text_config = XClipTextConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - # overwrite from common since FlaxXClipModel returns nested output - # which is not supported in the common test - @is_pt_flax_cross_test - def test_equivalence_pt_to_flax(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - with self.subTest(model_class.__name__): - - # load PyTorch class - pt_model = model_class(config).eval() - # Flax models don't use the `use_cache` option and cache is not returned as a 
default. - # So we disable `use_cache` here for PyTorch model. - pt_model.config.use_cache = False - - fx_model_class_name = "Flax" + model_class.__name__ - - if not hasattr(transformers, fx_model_class_name): - return - - fx_model_class = getattr(transformers, fx_model_class_name) - - # load Flax class - fx_model = fx_model_class(config, dtype=jnp.float32) - # make sure only flax inputs are forward that actually exist in function args - fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() - - # prepare inputs - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - - # remove function args that don't exist in Flax - pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} - - fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) - fx_model.params = fx_state - - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs).to_tuple() - - # convert inputs to Flax - fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} - fx_outputs = fx_model(**fx_inputs).to_tuple() - self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - - with tempfile.TemporaryDirectory() as tmpdirname: - pt_model.save_pretrained(tmpdirname) - fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True) - - fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple() - self.assertEqual( - len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" - ) - for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]): - self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) - - # overwrite from common since FlaxXClipModel returns nested output - # which is not supported in the common test - @is_pt_flax_cross_test - def test_equivalence_flax_to_pt(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - with self.subTest(model_class.__name__): - # load corresponding PyTorch class - pt_model = model_class(config).eval() - - # So we disable `use_cache` here for PyTorch model. 
- pt_model.config.use_cache = False - - fx_model_class_name = "Flax" + model_class.__name__ - - if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return - - fx_model_class = getattr(transformers, fx_model_class_name) - - # load Flax class - fx_model = fx_model_class(config, dtype=jnp.float32) - # make sure only flax inputs are forward that actually exist in function args - fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() - - pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) - - # make sure weights are tied in PyTorch - pt_model.tie_weights() - - # prepare inputs - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - - # remove function args that don't exist in Flax - pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} - - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs).to_tuple() - - fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} - - fx_outputs = fx_model(**fx_inputs).to_tuple() - self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - - for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - - with tempfile.TemporaryDirectory() as tmpdirname: - fx_model.save_pretrained(tmpdirname) - pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True) - - with torch.no_grad(): - pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() - - self.assertEqual( - len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" - ) - for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - @slow def test_model_from_pretrained(self): for model_name in X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: From 0c0fe95b31d85be90df618317769eef2fbbbaae1 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 24 Aug 2022 09:18:07 +0000 Subject: [PATCH 13/40] Fix some tests --- .../models/x_clip/modeling_x_clip.py | 3 +- tests/models/x_clip/test_modeling_x_clip.py | 70 +++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 2874a89bf0ebb..61597345848cb 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -460,7 +460,6 @@ def forward( return outputs -# Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->XClip,clip->x_clip class XClipPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -516,7 +515,7 @@ def _init_weights(self, module): module.bias.data.zero_() def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, XClipEncoder): + if isinstance(module, (XClipEncoder, XClipVisionEncoder)): module.gradient_checkpointing = value diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 6204133babe70..5453846965f2e 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -59,6 +59,7 @@ def __init__( image_size=30, patch_size=2, num_channels=3, + num_frames=6, # important; the batch size * time must be divisible by the number of frames is_training=True, hidden_size=32, 
num_hidden_layers=5, @@ -74,6 +75,7 @@ def __init__( self.image_size = image_size self.patch_size = patch_size self.num_channels = num_channels + self.num_frames = num_frames self.is_training = is_training self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -99,6 +101,7 @@ def get_config(self): image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, + num_frames=self.num_frames, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, @@ -199,6 +202,73 @@ def test_model_from_pretrained(self): model = XClipVisionModel.from_pretrained(model_name) self.assertIsNotNone(model) + def test_gradient_checkpointing_backward_compatibility(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + if not model_class.supports_gradient_checkpointing: + continue + + print("Model class:", model_class) + + config.gradient_checkpointing = True + model = model_class(config) + self.assertTrue(model.is_gradient_checkpointing) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # we add 1 here due to the special message token in X-CLIP's vision encoder + seq_len = getattr(self.model_tester, "seq_length", None) + 1 + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(len(outputs.attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(len(outputs.attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(outputs.attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 1, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length], + ) + class XClipTextModelTester: def __init__( From f944b4929d8f0912937739b7815b8fbddecb57de Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 24 Aug 2022 10:03:54 +0000 Subject: [PATCH 14/40] Fix some more tests --- .../models/x_clip/modeling_x_clip.py | 42 +++++++++---------- tests/models/x_clip/test_modeling_x_clip.py | 19 ++++++--- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py 
b/src/transformers/models/x_clip/modeling_x_clip.py index 61597345848cb..61f819706c962 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -499,20 +499,24 @@ def _init_weights(self, module): nn.init.normal_(module.fc1.weight, std=fc_std) nn.init.normal_(module.fc2.weight, std=in_proj_std) elif isinstance(module, XClipModel): + factor = self.config.initializer_factor nn.init.normal_( module.text_projection.weight, - std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + std=module.text_embed_dim**-0.5 * factor, ) nn.init.normal_( module.visual_projection.weight, - std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + std=module.vision_embed_dim**-0.5 * factor, ) + nn.init.normal_(module.prompts_visual_projection, mean=0.0, std=module.vision_embed_dim**-0.5 * factor) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor) + if module.bias is not None: + module.bias.data.zero_() def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (XClipEncoder, XClipVisionEncoder)): @@ -1093,6 +1097,9 @@ def forward( ) -> Union[Tuple, BaseModelOutput]: residual = hidden_states + print("Shape of hidden states:", hidden_states.shape) + print("Shape of position embedding:", self.position_embedding.data.shape) + # add position embeddings hidden_states = hidden_states + self.position_embedding @@ -1180,17 +1187,6 @@ def __init__(self, config): self.layernorm = nn.LayerNorm(embed_dim) self.decoder = nn.ModuleList([PromptGeneratorLayer(config) for _ in range(config.prompt_layers)]) self.alpha = nn.Parameter(torch.ones(embed_dim) * config.prompt_alpha) - # self.apply(self._init_weights) - - # TODO do this in the init weights of XClipModel - # def _init_weights(self, m): - # if isinstance(m, nn.Linear): - # trunc_normal_(m.weight, std=.02) - # if isinstance(m, nn.Linear) and m.bias is not None: - # nn.init.constant_(m.bias, 0) - # elif isinstance(m, nn.LayerNorm): - # nn.init.constant_(m.bias, 0) - # nn.init.constant_(m.weight, 1.0) def forward(self, text, visual): visual = self.layernorm(visual) @@ -1411,20 +1407,22 @@ def forward( ) # TODO remove this assertion (vision pooler output) - assert torch.allclose(vision_outputs.pooler_output[0, :3], torch.tensor([-0.2987, 1.0489, 0.3702]), atol=1e-4) + # assert torch.allclose(vision_outputs.pooler_output[0, :3], torch.tensor([-0.2987, 1.0489, 0.3702]), atol=1e-4) image_embeds = vision_outputs[1] image_embeds = self.visual_projection(image_embeds) cls_features = image_embeds.view(batch_size, num_frames, -1) - print("Shape of MIT input:", cls_features.shape) - print("Initial values of MIT input:", cls_features[0, :3, :3]) + # print("Shape of MIT input:", cls_features.shape) + # print("Initial values of MIT input:", cls_features[0, :3, :3]) image_embeds = self.mit(cls_features) - print("Shape of output of MIT:", image_embeds.shape) - print("First values of output of MIT:", image_embeds[0, :3]) + print("Shape of image embeds:", image_embeds.shape) + + # print("Shape of output of MIT:", image_embeds.shape) + # print("First values of output of MIT:", image_embeds[0, :3]) img_features = vision_outputs[0][:, 1:, :] img_features = self.prompts_visual_layernorm(img_features) @@ -1445,8 +1443,8 @@ def forward( text_embeds 
= self.text_projection(text_embeds) # TODO remove this assertion (text pooler output) - assert torch.allclose(text_embeds[0, :3], torch.tensor([-0.2870, -0.3504, 0.0417]), atol=1e-4) - print("Looks ok!") + # assert torch.allclose(text_embeds[0, :3], torch.tensor([-0.2870, -0.3504, 0.0417]), atol=1e-4) + # print("Looks ok!") text_embeds = text_embeds.unsqueeze(0).expand(batch_size, -1, -1) text_embeds = text_embeds + self.prompts_generator(text_embeds, img_features) diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 5453846965f2e..c5a9da6dbb379 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -65,6 +65,7 @@ def __init__( num_hidden_layers=5, num_attention_heads=4, intermediate_size=37, + mit_hidden_size=64, dropout=0.1, attention_dropout=0.1, initializer_range=0.02, @@ -81,6 +82,7 @@ def __init__( self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size + self.mit_hidden_size = mit_hidden_size self.dropout = dropout self.attention_dropout = attention_dropout self.initializer_range = initializer_range @@ -106,6 +108,7 @@ def get_config(self): num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, intermediate_size=self.intermediate_size, + mit_hidden_size=self.mit_hidden_size, dropout=self.dropout, attention_dropout=self.attention_dropout, initializer_range=self.initializer_range, @@ -400,9 +403,10 @@ def test_model_from_pretrained(self): class XClipModelTester: - def __init__(self, parent, num_frames=8, is_training=True): + def __init__(self, parent, projection_dim=64, mit_hidden_size=64, is_training=True): self.parent = parent - self.num_frames = num_frames + self.projection_dim = projection_dim + self.mit_hidden_size = mit_hidden_size self.text_model_tester = XClipTextModelTester(parent) self.vision_model_tester = XClipVisionModelTester(parent) self.is_training = is_training @@ -413,7 +417,7 @@ def prepare_config_and_inputs(self): pixel_values = floats_tensor( [ self.vision_model_tester.batch_size, - self.num_frames, + self.vision_model_tester.num_frames, self.vision_model_tester.num_channels, self.vision_model_tester.image_size, self.vision_model_tester.image_size, @@ -426,7 +430,9 @@ def prepare_config_and_inputs(self): def get_config(self): return XClipConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + self.text_model_tester.get_config(), + self.vision_model_tester.get_config(), + projection_dim=self.projection_dim, ) def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): @@ -460,6 +466,7 @@ class XClipModelTest(ModelTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_attention_outputs = False + test_torchscript = False def setUp(self): self.model_tester = XClipModelTester(self) @@ -484,7 +491,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_common_attributes(self): pass - # override as the `logit_scale` parameter initilization is different for X_CLIP + # override as the `logit_scale`, `prompts_generator.alpha` and `mit.position_embedding` parameters require special treatment def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -501,6 +508,8 @@ def test_initialization(self): delta=1e-3, msg=f"Parameter {name} of model {model_class} 
seems not properly initialized", ) + elif name in ["prompts_generator.alpha", "mit.position_embedding"]: + pass else: self.assertIn( ((param.data.mean() * 1e9).round() / 1e9).item(), From a77dfabf05740c4cfd3794f4e7d80879c1a58bc6 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 24 Aug 2022 10:33:44 +0000 Subject: [PATCH 15/40] Fix more tests --- .../x_clip/convert_x_clip_original_pytorch_to_hf.py | 1 + src/transformers/models/x_clip/modeling_x_clip.py | 6 +++++- tests/models/x_clip/test_modeling_x_clip.py | 8 ++++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 4cda3a4614fa4..ada5344052ee1 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -193,6 +193,7 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat # Verify outputs logits_per_image = outputs.logits_per_image + print("Shape of logits per image:", logits_per_image.shape) probs = logits_per_image.softmax(dim=1) expected_probs = torch.tensor([[[0.0019], [0.9951], [0.0030]]]) assert torch.allclose(probs, expected_probs, atol=1e-3) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 61f819706c962..91be4344dda65 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -1230,7 +1230,7 @@ def __init__(self, config: XClipConfig): self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value) self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim) - self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.text_embed_dim)) + self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.projection_dim)) mit_config = copy(vision_config) mit_config.hidden_size = vision_config.mit_hidden_size @@ -1442,6 +1442,8 @@ def forward( text_embeds = text_outputs[1] text_embeds = self.text_projection(text_embeds) + print("Shape of inital text embeds:", text_embeds.shape) + # TODO remove this assertion (text pooler output) # assert torch.allclose(text_embeds[0, :3], torch.tensor([-0.2870, -0.3504, 0.0417]), atol=1e-4) # print("Looks ok!") @@ -1455,6 +1457,8 @@ def forward( # cosine similarity as logits logit_scale = self.logit_scale.exp() + print("Shape of text embeds:", text_embeds.shape) + print("Shape of image embeds:", image_embeds.shape) logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale logits_per_image = logits_per_text.T diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index c5a9da6dbb379..84e1a738b8b5a 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -55,11 +55,11 @@ class XClipVisionModelTester: def __init__( self, parent, - batch_size=12, + batch_size=16, image_size=30, patch_size=2, num_channels=3, - num_frames=6, # important; the batch size * time must be divisible by the number of frames + num_frames=8, # important; the batch size * time must be divisible by the number of frames is_training=True, hidden_size=32, num_hidden_layers=5, @@ -440,10 +440,10 @@ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values with torch.no_grad(): result = 
model(input_ids, pixel_values, attention_mask) self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size, self.vision_model_tester.batch_size) ) self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + result.logits_per_text.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size, self.vision_model_tester.batch_size) ) def prepare_config_and_inputs_for_common(self): From adad2469e4fa2ae34f5b6b3694695e4e9df66865 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 24 Aug 2022 16:16:21 +0000 Subject: [PATCH 16/40] Improve conversion script --- .../convert_x_clip_original_pytorch_to_hf.py | 11 ++++++++-- src/transformers/models/x_clip/test.py | 20 +++++++++++++++---- tests/models/x_clip/test_modeling_x_clip.py | 14 +++++++++++-- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index ada5344052ee1..19d3614e6010e 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -162,7 +162,7 @@ def convert_state_dict(orig_state_dict, config): return orig_state_dict -def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_path): +def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_path=None, push_to_hub=False): config = get_xclip_config(model_name) model = XClipModel(config) model.eval() @@ -203,6 +203,10 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat print(f"Saving model {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) + if push_to_hub: + print("Pushing to the hub...") + model.push_to_hub(model_name, organization="nielsr") + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -222,6 +226,9 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) args = parser.parse_args() - convert_xclip_checkpoint(args.checkpoint_url, args.model_name, args.pytorch_dump_folder_path) + convert_xclip_checkpoint(args.checkpoint_url, args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/x_clip/test.py b/src/transformers/models/x_clip/test.py index b923826031edf..d50e537c137c4 100644 --- a/src/transformers/models/x_clip/test.py +++ b/src/transformers/models/x_clip/test.py @@ -1,8 +1,20 @@ -from transformers import XClipConfig, XClipModel - +from transformers import AutoTokenizer, XClipConfig, XClipModel +from huggingface_hub import hf_hub_download +import torch config = XClipConfig() model = XClipModel(config) -for name, param in model.named_parameters(): - print(name, param.shape) +file_path = hf_hub_download( + repo_id="hf-internal-testing/spaghetti-video-8-frames", filename="pixel_values.pt", repo_type="dataset" +) +pixel_values = torch.load(file_path) + +tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") +input_ids = tokenizer( + ["playing sports", "eating spaghetti", "go shopping"], padding="max_length", return_tensors="pt" +).input_ids + + +with torch.no_grad(): + outputs = model(input_ids=input_ids, pixel_values=pixel_values) \ No newline at end of file diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 84e1a738b8b5a..70fe27f22482c 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -440,10 +440,20 @@ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values with torch.no_grad(): result = model(input_ids, pixel_values, attention_mask) self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + result.logits_per_image.shape, + ( + self.vision_model_tester.batch_size, + self.text_model_tester.batch_size, + self.vision_model_tester.batch_size, + ), ) self.parent.assertEqual( - result.logits_per_text.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + result.logits_per_text.shape, + ( + self.vision_model_tester.batch_size, + self.text_model_tester.batch_size, + self.vision_model_tester.batch_size, + ), ) def prepare_config_and_inputs_for_common(self): From 8c1b6006262cd1814922d477b80c56cb7dc9a501 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 25 Aug 2022 09:14:02 +0000 Subject: [PATCH 17/40] Fix model outputs --- .../convert_x_clip_original_pytorch_to_hf.py | 3 ++- .../models/x_clip/modeling_x_clip.py | 17 ++++++----------- src/transformers/models/x_clip/test.py | 16 ++++++++++++---- tests/models/x_clip/test_modeling_x_clip.py | 2 -- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 19d3614e6010e..10f34721f71ad 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -195,7 +195,8 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat logits_per_image = outputs.logits_per_image print("Shape of logits per image:", logits_per_image.shape) probs = logits_per_image.softmax(dim=1) - expected_probs = torch.tensor([[[0.0019], [0.9951], [0.0030]]]) + print("Probs:", 
probs) + expected_probs = torch.tensor([[0.0019, 0.9951, 0.0030]]) assert torch.allclose(probs, expected_probs, atol=1e-3) print("Looks ok!") diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 91be4344dda65..bc26acecfacfb 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -1097,9 +1097,6 @@ def forward( ) -> Union[Tuple, BaseModelOutput]: residual = hidden_states - print("Shape of hidden states:", hidden_states.shape) - print("Shape of position embedding:", self.position_embedding.data.shape) - # add position embeddings hidden_states = hidden_states + self.position_embedding @@ -1419,8 +1416,6 @@ def forward( image_embeds = self.mit(cls_features) - print("Shape of image embeds:", image_embeds.shape) - # print("Shape of output of MIT:", image_embeds.shape) # print("First values of output of MIT:", image_embeds[0, :3]) @@ -1442,8 +1437,6 @@ def forward( text_embeds = text_outputs[1] text_embeds = self.text_projection(text_embeds) - print("Shape of inital text embeds:", text_embeds.shape) - # TODO remove this assertion (text pooler output) # assert torch.allclose(text_embeds[0, :3], torch.tensor([-0.2870, -0.3504, 0.0417]), atol=1e-4) # print("Looks ok!") @@ -1455,12 +1448,14 @@ def forward( image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + print("Shape of image embeds:", image_embeds.shape) + print("Shape of text embeds:", text_embeds.shape) + # cosine similarity as logits logit_scale = self.logit_scale.exp() - print("Shape of text embeds:", text_embeds.shape) - print("Shape of image embeds:", image_embeds.shape) - logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale - logits_per_image = logits_per_text.T + + logits_per_image = torch.einsum("bd,bkd->bk", image_embeds, logit_scale * text_embeds) + logits_per_text = logits_per_image.T loss = None if return_loss: diff --git a/src/transformers/models/x_clip/test.py b/src/transformers/models/x_clip/test.py index d50e537c137c4..6607b52046fcd 100644 --- a/src/transformers/models/x_clip/test.py +++ b/src/transformers/models/x_clip/test.py @@ -1,15 +1,20 @@ -from transformers import AutoTokenizer, XClipConfig, XClipModel -from huggingface_hub import hf_hub_download import torch +from huggingface_hub import hf_hub_download +from transformers import AutoTokenizer, XClipConfig, XClipModel + + config = XClipConfig() model = XClipModel(config) file_path = hf_hub_download( - repo_id="hf-internal-testing/spaghetti-video-8-frames", filename="pixel_values.pt", repo_type="dataset" + repo_id="hf-internal-testing/spaghetti-video-8-frames", filename="pixel_values.pt", repo_type="dataset" ) pixel_values = torch.load(file_path) +pixel_values = torch.cat([pixel_values, pixel_values], dim=0) +print("Shape of pixel values:", pixel_values.shape) + tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") input_ids = tokenizer( ["playing sports", "eating spaghetti", "go shopping"], padding="max_length", return_tensors="pt" @@ -17,4 +22,7 @@ with torch.no_grad(): - outputs = model(input_ids=input_ids, pixel_values=pixel_values) \ No newline at end of file + outputs = model(input_ids=input_ids, pixel_values=pixel_values) + logits_per_image = outputs.logits_per_image + +print("Shape of logits per image:", logits_per_image.shape) diff --git a/tests/models/x_clip/test_modeling_x_clip.py 
b/tests/models/x_clip/test_modeling_x_clip.py index 70fe27f22482c..14fe9806e81de 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -444,13 +444,11 @@ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values ( self.vision_model_tester.batch_size, self.text_model_tester.batch_size, - self.vision_model_tester.batch_size, ), ) self.parent.assertEqual( result.logits_per_text.shape, ( - self.vision_model_tester.batch_size, self.text_model_tester.batch_size, self.vision_model_tester.batch_size, ), From 6688cc27cf731be38fd83bf8b1170949d19b2536 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 25 Aug 2022 10:30:29 +0000 Subject: [PATCH 18/40] Fix more tests --- .../models/x_clip/configuration_x_clip.py | 12 ++++++------ src/transformers/models/x_clip/modeling_x_clip.py | 3 +-- src/transformers/models/x_clip/test.py | 6 ++---- src/transformers/models/x_clip/test_clip.py | 15 +++++++++++++++ tests/models/x_clip/test_modeling_x_clip.py | 5 +++-- 5 files changed, 27 insertions(+), 14 deletions(-) create mode 100644 src/transformers/models/x_clip/test_clip.py diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 18ecde6c87d81..85e7bc958d931 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -84,7 +84,7 @@ class XClipTextConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "x_clip_text_model" + model_type = "xclip_text_model" def __init__( self, @@ -126,7 +126,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from XClipConfig - if config_dict.get("model_type") == "x_clip": + if config_dict.get("model_type") == "xclip": config_dict = config_dict["text_config"] if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: @@ -203,7 +203,7 @@ class XClipVisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "x_clip_vision_model" + model_type = "xclip_vision_model" def __init__( self, @@ -256,7 +256,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from XClipConfig - if config_dict.get("model_type") == "x_clip": + if config_dict.get("model_type") == "xclip": config_dict = config_dict["vision_config"] if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: @@ -302,7 +302,7 @@ class XClipConfig(PretrainedConfig): Dictionary of keyword arguments. """ - model_type = "x-clip" + model_type = "xclip" is_composition = True def __init__( @@ -345,7 +345,7 @@ def __init__( @classmethod def from_text_vision_configs(cls, text_config: XClipTextConfig, vision_config: XClipVisionConfig, **kwargs): r""" - Instantiate a [`XClipConfig`] (or a derived class) from x_clip text model configuration and x_clip vision model + Instantiate a [`XClipConfig`] (or a derived class) from xclip text model configuration and xclip vision model configuration. 
Returns: diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index bc26acecfacfb..415c2a7355ff2 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -1453,7 +1453,6 @@ def forward( # cosine similarity as logits logit_scale = self.logit_scale.exp() - logits_per_image = torch.einsum("bd,bkd->bk", image_embeds, logit_scale * text_embeds) logits_per_text = logits_per_image.T @@ -1471,6 +1470,6 @@ def forward( logits_per_text=logits_per_text, text_embeds=text_embeds, image_embeds=image_embeds, - text_model_output=text_embeds, + text_model_output=text_outputs, vision_model_output=vision_outputs, ) diff --git a/src/transformers/models/x_clip/test.py b/src/transformers/models/x_clip/test.py index 6607b52046fcd..c60be1a5be2d0 100644 --- a/src/transformers/models/x_clip/test.py +++ b/src/transformers/models/x_clip/test.py @@ -22,7 +22,5 @@ with torch.no_grad(): - outputs = model(input_ids=input_ids, pixel_values=pixel_values) - logits_per_image = outputs.logits_per_image - -print("Shape of logits per image:", logits_per_image.shape) + outputs = model(input_ids=input_ids, pixel_values=pixel_values, return_loss=True) + print(outputs[0]) diff --git a/src/transformers/models/x_clip/test_clip.py b/src/transformers/models/x_clip/test_clip.py new file mode 100644 index 0000000000000..d8f439eddd039 --- /dev/null +++ b/src/transformers/models/x_clip/test_clip.py @@ -0,0 +1,15 @@ +from transformers import CLIPProcessor, CLIPConfig, CLIPModel +from PIL import Image +import requests + + +model = CLIPModel(CLIPConfig()) +processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) + +outputs = model(**inputs) +print(outputs[0]) \ No newline at end of file diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 14fe9806e81de..0930376440f93 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -55,7 +55,7 @@ class XClipVisionModelTester: def __init__( self, parent, - batch_size=16, + batch_size=8, image_size=30, patch_size=2, num_channels=3, @@ -277,7 +277,7 @@ class XClipTextModelTester: def __init__( self, parent, - batch_size=12, + batch_size=8, seq_length=7, is_training=True, use_input_mask=True, @@ -475,6 +475,7 @@ class XClipModelTest(ModelTesterMixin, unittest.TestCase): test_resize_embeddings = False test_attention_outputs = False test_torchscript = False + maxdiff = None def setUp(self): self.model_tester = XClipModelTester(self) From 07694d4619c224d70ae00a4bc1ac555117a4e228 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 25 Aug 2022 14:01:43 +0000 Subject: [PATCH 19/40] Add XClipProcessor --- docs/source/en/model_doc/x-clip.mdx | 5 + src/transformers/__init__.py | 14 ++- .../models/auto/feature_extraction_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 2 +- .../models/auto/processing_auto.py | 2 +- src/transformers/models/x_clip/__init__.py | 2 + .../convert_x_clip_original_pytorch_to_hf.py | 23 +++- .../models/x_clip/modeling_x_clip.py | 9 +- .../models/x_clip/processing_x_clip.py | 108 ++++++++++++++++++ src/transformers/models/x_clip/test_clip.py | 5 +- .../utils/dummy_vision_objects.py | 7 -- 
tests/models/x_clip/test_modeling_x_clip.py | 46 +++++++- 12 files changed, 195 insertions(+), 30 deletions(-) create mode 100644 src/transformers/models/x_clip/processing_x_clip.py diff --git a/docs/source/en/model_doc/x-clip.mdx b/docs/source/en/model_doc/x-clip.mdx index 2f04e33867c72..e313846425dc2 100644 --- a/docs/source/en/model_doc/x-clip.mdx +++ b/docs/source/en/model_doc/x-clip.mdx @@ -29,6 +29,11 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface The original code can be found [here](). +## XClipProcessor + +[[autodoc]] XClipProcessor + + ## XClipConfig [[autodoc]] XClipConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 644ca40a7256a..d6d56bb3ba9fc 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -165,6 +165,7 @@ "models.clip": [ "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPConfig", + "CLIPProcessor", "CLIPTextConfig", "CLIPTokenizer", "CLIPVisionConfig", @@ -370,6 +371,7 @@ "models.x_clip": [ "X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "XClipConfig", + "XClipProcessor", "XClipTextConfig", "XClipVisionConfig", ], @@ -644,7 +646,6 @@ _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") _import_structure["models.clip"].append("CLIPFeatureExtractor") - _import_structure["models.clip"].append("CLIPProcessor") _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") @@ -3003,6 +3004,7 @@ from .models.clip import ( CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPConfig, + CLIPProcessor, CLIPTextConfig, CLIPTokenizer, CLIPVisionConfig, @@ -3179,7 +3181,13 @@ from .models.wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizer from .models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM from .models.wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig - from .models.x_clip import X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, XClipConfig, XClipTextConfig, XClipVisionConfig + from .models.x_clip import ( + X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, + XClipConfig, + XClipProcessor, + XClipTextConfig, + XClipVisionConfig, + ) from .models.xglm import XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XGLMConfig from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer from .models.xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig @@ -3417,7 +3425,7 @@ else: from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor - from .models.clip import CLIPFeatureExtractor, CLIPProcessor + from .models.clip import CLIPFeatureExtractor from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 8b3a4f6afdaf0..625b79db06494 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -75,7 +75,7 @@ ("vit_mae", "ViTFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"), - ("x-clip", "XClipFeatureExtractor"), + ("xclip", "CLIPFeatureExtractor"), ("yolos", "YolosFeatureExtractor"), ] ) diff --git a/src/transformers/models/auto/modeling_auto.py 
b/src/transformers/models/auto/modeling_auto.py index c7b431923fb28..1c3e1e03e8518 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -137,7 +137,7 @@ ("wav2vec2", "Wav2Vec2Model"), ("wav2vec2-conformer", "Wav2Vec2ConformerModel"), ("wavlm", "WavLMModel"), - ("x-clip", "XClipModel"), + ("xclip", "XClipModel"), ("xglm", "XGLMModel"), ("xlm", "XLMModel"), ("xlm-prophetnet", "XLMProphetNetModel"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index f5f1f40818848..7eff84c5d5671 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -58,7 +58,7 @@ ("wav2vec2-conformer", "Wav2Vec2Processor"), ("wav2vec2_with_lm", "Wav2Vec2ProcessorWithLM"), ("wavlm", "Wav2Vec2Processor"), - ("x-clip", "XClipProcessor"), + ("xclip", "CLIPProcessor"), ] ) diff --git a/src/transformers/models/x_clip/__init__.py b/src/transformers/models/x_clip/__init__.py index d4480f601f122..7e328b98f259b 100644 --- a/src/transformers/models/x_clip/__init__.py +++ b/src/transformers/models/x_clip/__init__.py @@ -27,6 +27,7 @@ "XClipTextConfig", "XClipVisionConfig", ], + "processing_x_clip": ["XClipProcessor"], } try: @@ -50,6 +51,7 @@ XClipTextConfig, XClipVisionConfig, ) + from .processing_x_clip import XClipProcessor try: if not is_torch_available(): diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 10f34721f71ad..bad9eb0a94c10 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -18,7 +18,14 @@ import torch from huggingface_hub import hf_hub_download -from transformers import AutoTokenizer, XClipConfig, XClipModel +from transformers import ( + CLIPTokenizer, + CLIPTokenizerFast, + VideoMAEFeatureExtractor, + XClipConfig, + XClipModel, + XClipProcessor, +) def get_xclip_config(model_name): @@ -175,6 +182,11 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat assert missing_keys == ["text_model.embeddings.position_ids", "vision_model.embeddings.position_ids"] model.eval() + feature_extractor = VideoMAEFeatureExtractor() + slow_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + fast_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32") + processor = XClipProcessor(feature_extractor=feature_extractor, tokenizer=fast_tokenizer) + file_path = hf_hub_download( repo_id="hf-internal-testing/spaghetti-video-8-frames", filename="pixel_values.pt", repo_type="dataset" ) @@ -183,8 +195,7 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) # inputs = feature_extractor(images=image, return_tensors="pt") - tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") - input_ids = tokenizer( + input_ids = fast_tokenizer( ["playing sports", "eating spaghetti", "go shopping"], padding="max_length", return_tensors="pt" ).input_ids @@ -193,9 +204,7 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat # Verify outputs logits_per_image = outputs.logits_per_image - print("Shape of logits per image:", logits_per_image.shape) probs = logits_per_image.softmax(dim=1) - 
print("Probs:", probs) expected_probs = torch.tensor([[0.0019, 0.9951, 0.0030]]) assert torch.allclose(probs, expected_probs, atol=1e-3) print("Looks ok!") @@ -205,8 +214,10 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat model.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - print("Pushing to the hub...") + print("Pushing model, processor and slow tokenizer files to the hub...") model.push_to_hub(model_name, organization="nielsr") + processor.push_to_hub(model_name, organization="nielsr") + slow_tokenizer.push_to_hub(model_name, organization="nielsr") if __name__ == "__main__": diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 415c2a7355ff2..1d4ce2ed148ca 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -41,7 +41,8 @@ _CHECKPOINT_FOR_DOC = "microsoft/xclip-base-patch32" X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "microsoft/xclip-base-patch32", + # TODO update to appropriate organization + "nielsr/xclip-base-patch32", # See all X-CLIP models at https://huggingface.co/models?filter=x-clip ] @@ -1448,8 +1449,8 @@ def forward( image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) - print("Shape of image embeds:", image_embeds.shape) - print("Shape of text embeds:", text_embeds.shape) + # print("Shape of image embeds:", image_embeds.shape) + # print("Shape of text embeds:", text_embeds.shape) # cosine similarity as logits logit_scale = self.logit_scale.exp() @@ -1461,7 +1462,7 @@ def forward( loss = x_clip_loss(logits_per_text) if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_embeds, vision_outputs) + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) return ((loss,) + output) if loss is not None else output return XClipOutput( diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py new file mode 100644 index 0000000000000..e8541036a4b1c --- /dev/null +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -0,0 +1,108 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for XCLIP +""" +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding + + +class XClipProcessor(ProcessorMixin): + r""" + Constructs an X-CLIP processor which wraps a VideoMAE feature extractor and a CLIP tokenizer into a single + processor. + + [`XCLIPProcessor`] offers all the functionalities of [`VideoMAEFeatureExtractor`] and [`CLIPTokenizerFast`]. See + the [`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information. + + Args: + feature_extractor ([`VideoMAEFeatureExtractor`]): + The feature extractor is a required input. 
+ tokenizer ([`CLIPTokenizerFast`]): + The tokenizer is a required input. + """ + feature_extractor_class = "VideoMAEFeatureExtractor" + tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + self.current_processor = self.feature_extractor + + def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + VideoMAEFeatureExtractor's [`~VideoMAEFeatureExtractor.__call__`] if `images` is not `None`. Please refer to + the doctsring of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if text is None and images is None: + raise ValueError("You have to specify either text or images. Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. 
Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) \ No newline at end of file diff --git a/src/transformers/models/x_clip/test_clip.py b/src/transformers/models/x_clip/test_clip.py index d8f439eddd039..11bc7a0c975b4 100644 --- a/src/transformers/models/x_clip/test_clip.py +++ b/src/transformers/models/x_clip/test_clip.py @@ -1,6 +1,7 @@ -from transformers import CLIPProcessor, CLIPConfig, CLIPModel from PIL import Image + import requests +from transformers import CLIPConfig, CLIPModel, CLIPProcessor model = CLIPModel(CLIPConfig()) @@ -12,4 +13,4 @@ inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) outputs = model(**inputs) -print(outputs[0]) \ No newline at end of file +print(outputs[0]) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index fa30432070a37..e1f4f3b1fd9fa 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -24,13 +24,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class CLIPProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class ConvNextFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 0930376440f93..bd3276940d800 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -24,7 +24,7 @@ import requests from transformers import XClipConfig, XClipTextConfig, XClipVisionConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -93,7 +93,9 @@ def __init__( self.seq_length = num_patches + 1 def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + pixel_values = floats_tensor( + [self.batch_size * self.num_frames, self.num_channels, self.image_size, self.image_size] + ) config = self.get_config() return config, pixel_values @@ -124,8 +126,10 @@ def create_and_check_model(self, config, pixel_values): image_size = (self.image_size, self.image_size) patch_size = (self.patch_size, self.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size * self.num_frames, num_patches + 1, self.hidden_size) + ) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size * self.num_frames, self.hidden_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -272,6 +276,37 @@ def test_attention_outputs(self): [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length], ) + @require_torch_multi_gpu + def test_multi_gpu_data_parallel_forward(self): + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() + + # some params shouldn't be scattered by nn.DataParallel + # so just remove them if they are present. + blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"] + for k in blacklist_non_batched_params: + inputs_dict.pop(k, None) + + # move input tensors to cuda:O + for k, v in inputs_dict.items(): + if torch.is_tensor(v): + inputs_dict[k] = v.to(0) + + for model_class in self.all_model_classes: + model = model_class(config=config) + model.to(0) + model.eval() + + # Wrap model in nn.DataParallel + model = nn.DataParallel(model) + with torch.no_grad(): + test = self._prepare_for_class(inputs_dict, model_class) + for k, v in test.items(): + if isinstance(v, torch.Tensor): + print(k, v.shape) + else: + print(k, v) + _ = model(**self._prepare_for_class(inputs_dict, model_class)) + class XClipTextModelTester: def __init__( @@ -611,7 +646,8 @@ def prepare_img(): class XClipModelIntegrationTest(unittest.TestCase): @slow def test_inference(self): - model_name = "microsoft/xclip-base-patch32" + # TODO update organization + model_name = "nielsr/xclip-base-patch32" model = XClipModel.from_pretrained(model_name).to(torch_device) processor = CLIPProcessor.from_pretrained(model_name) From 7949831e9f4efa0e6cf1f31603396dd84bedfa1b Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 25 Aug 2022 16:26:00 +0000 Subject: [PATCH 20/40] Use processor in conversion script --- .../convert_x_clip_original_pytorch_to_hf.py | 24 ++++++++------- .../models/x_clip/processing_x_clip.py | 29 ++++++++++--------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index bad9eb0a94c10..6d179c9ef5cec 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -15,6 +15,7 @@ import argparse +import numpy as np import torch from huggingface_hub import hf_hub_download @@ -169,6 +170,14 @@ def convert_state_dict(orig_state_dict, config): return orig_state_dict +def prepare_video(): + file = hf_hub_download( + repo_id="datasets/hf-internal-testing/spaghetti-video", filename="eating_spaghetti_8_frames.npy" + ) + video = np.load(file) + return list(video) + + def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_path=None, push_to_hub=False): config = get_xclip_config(model_name) model = XClipModel(config) @@ -187,20 +196,13 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat fast_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32") processor = XClipProcessor(feature_extractor=feature_extractor, tokenizer=fast_tokenizer) - file_path = hf_hub_download( - repo_id="hf-internal-testing/spaghetti-video-8-frames", filename="pixel_values.pt", repo_type="dataset" + video = prepare_video() + inputs = processor( + text=["playing sports", "eating spaghetti", "go shopping"], videos=video, return_tensors="pt", padding=True ) - pixel_values = torch.load(file_path) - - # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(model_name.replace("_", "-"))) - # inputs = feature_extractor(images=image, return_tensors="pt") - - input_ids = fast_tokenizer( - ["playing sports", "eating spaghetti", "go shopping"], padding="max_length", return_tensors="pt" - ).input_ids with torch.no_grad(): - outputs 
= model(input_ids=input_ids, pixel_values=pixel_values) + outputs = model(**inputs) # Verify outputs logits_per_image = outputs.logits_per_image diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py index e8541036a4b1c..06932f093d6f5 100644 --- a/src/transformers/models/x_clip/processing_x_clip.py +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -40,12 +40,12 @@ def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to - VideoMAEFeatureExtractor's [`~VideoMAEFeatureExtractor.__call__`] if `images` is not `None`. Please refer to + the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to + VideoMAEFeatureExtractor's [`~VideoMAEFeatureExtractor.__call__`] if `videos` is not `None`. Please refer to the doctsring of the above two methods for more information. Args: @@ -53,10 +53,11 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`,: + `List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list + of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors, + each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of + channels. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: @@ -73,19 +74,19 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). - - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`. """ - if text is None and images is None: - raise ValueError("You have to specify either text or images. Both cannot be none.") + if text is None and videos is None: + raise ValueError("You have to specify either text or videos. 
Both cannot be none.") if text is not None: encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) - if images is not None: - image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs) + if videos is not None: + image_features = self.feature_extractor(videos, return_tensors=return_tensors, **kwargs) - if text is not None and images is not None: + if text is not None and videos is not None: encoding["pixel_values"] = image_features.pixel_values return encoding elif text is not None: @@ -105,4 +106,4 @@ def decode(self, *args, **kwargs): This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to the docstring of this method for more information. """ - return self.tokenizer.decode(*args, **kwargs) \ No newline at end of file + return self.tokenizer.decode(*args, **kwargs) From 4f0aee7e222a3411e95ddb206c31a77ccce938d7 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 25 Aug 2022 16:59:53 +0000 Subject: [PATCH 21/40] Fix integration test --- tests/models/x_clip/test_modeling_x_clip.py | 26 ++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index bd3276940d800..89697d24c372d 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -22,7 +22,7 @@ import numpy as np -import requests +from huggingface_hub import hf_hub_download from transformers import XClipConfig, XClipTextConfig, XClipVisionConfig from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -46,9 +46,7 @@ if is_vision_available(): - from PIL import Image - - from transformers import CLIPProcessor + from transformers import XClipProcessor class XClipVisionModelTester: @@ -634,11 +632,13 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im +# We will verify our results on a spaghetti video +def prepare_video(): + file = hf_hub_download( + repo_id="datasets/hf-internal-testing/spaghetti-video", filename="eating_spaghetti_8_frames.npy" + ) + video = np.load(file) + return list(video) @require_vision @@ -649,11 +649,11 @@ def test_inference(self): # TODO update organization model_name = "nielsr/xclip-base-patch32" model = XClipModel.from_pretrained(model_name).to(torch_device) - processor = CLIPProcessor.from_pretrained(model_name) + processor = XClipProcessor.from_pretrained(model_name) - image = prepare_img() + video = prepare_video() inputs = processor( - text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" + text=["playing sports", "eating spaghetti", "go shopping"], videos=video, return_tensors="pt", padding=True ).to(torch_device) # forward pass @@ -670,6 +670,6 @@ def test_inference(self): torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), ) - expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) + expected_logits = torch.tensor([[14.3819, 20.6031, 15.0526]], device=torch_device) self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) From 99491710cc6d335644f2108a657a7f339ee145be Mon Sep 17 00:00:00 
2001 From: NielsRogge Date: Fri, 26 Aug 2022 08:10:30 +0000 Subject: [PATCH 22/40] Update README, fix docs --- README.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.mdx | 6 +- docs/source/en/model_doc/x-clip.mdx | 65 ------------------- docs/source/en/model_doc/xclip.mdx | 64 ++++++++++++++++++ .../models/auto/configuration_auto.py | 6 +- tests/models/x_clip/test_modeling_x_clip.py | 4 ++ 9 files changed, 78 insertions(+), 75 deletions(-) delete mode 100644 docs/source/en/model_doc/x-clip.mdx create mode 100644 docs/source/en/model_doc/xclip.mdx diff --git a/README.md b/README.md index 097d934f016b4..6b5b95f135668 100644 --- a/README.md +++ b/README.md @@ -382,7 +382,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. **[X-XClip](https://huggingface.co/docs/transformers/main/model_doc/x-clip)** (from ) released with the paper []() by . +1. **[X-CLIP](https://huggingface.co/docs/transformers/main/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 
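For reference, the end-to-end usage that the conversion script and the integration test in the preceding commits both exercise boils down to the short sketch below. It is only a sketch of the branch at this point in the series: the checkpoint still lives under the temporary `nielsr/xclip-base-patch32` name, the processor pairs a `VideoMAEFeatureExtractor` with a CLIP tokenizer so that a single call prepares both the candidate labels and the video frames, and the similarity scores are still exposed as `logits_per_image` (later commits rename the classes to `XCLIP*` and the logits to `logits_per_video`).

```python
import numpy as np
import torch
from huggingface_hub import hf_hub_download

from transformers import XClipModel, XClipProcessor

# 8 pre-extracted frames of the "eating spaghetti" clip used by the tests in this PR
file = hf_hub_download(
    repo_id="datasets/hf-internal-testing/spaghetti-video", filename="eating_spaghetti_8_frames.npy"
)
video = list(np.load(file))

model_name = "nielsr/xclip-base-patch32"  # temporary organization, see the TODOs above
model = XClipModel.from_pretrained(model_name)
processor = XClipProcessor.from_pretrained(model_name)

# one call tokenizes the candidate labels and turns the frames into pixel values
inputs = processor(
    text=["playing sports", "eating spaghetti", "go shopping"], videos=video, return_tensors="pt", padding=True
)

with torch.no_grad():
    outputs = model(**inputs)

# video-text similarity scores, softmaxed into probabilities over the candidate labels
probs = outputs.logits_per_image.softmax(dim=1)
print(probs)
```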
diff --git a/README_ko.md b/README_ko.md index 8d2f9a3c51e1c..67add828e4104 100644 --- a/README_ko.md +++ b/README_ko.md @@ -334,7 +334,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. **[X-XClip](https://huggingface.co/docs/transformers/main/model_doc/x-clip)** (from ) released with the paper []() by . +1. **[X-CLIP](https://huggingface.co/docs/transformers/main/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. diff --git a/README_zh-hans.md b/README_zh-hans.md index f6b4d1b71e798..4d93d04b7702d 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -358,7 +358,7 @@ conda install -c huggingface transformers 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。 1. 
**[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. **[X-XClip](https://huggingface.co/docs/transformers/main/model_doc/x-clip)** (from ) released with the paper []() by . +1. **[X-CLIP](https://huggingface.co/docs/transformers/main/model_doc/xclip)** (来自 Microsoft Research) 伴随论文 [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) 由 Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling 发布。 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index b12a945661d53..297478f702b28 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -370,7 +370,7 @@ conda install -c huggingface transformers 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. 
**[X-XClip](https://huggingface.co/docs/transformers/main/model_doc/x-clip)** (from ) released with the paper []() by . +1. **[X-CLIP](https://huggingface.co/docs/transformers/main/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index f3da3efa34c9c..4e86bf1718248 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -176,7 +176,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. **[X-XClip](model_doc/x-clip)** (from ) released with the paper []() by . +1. **[X-CLIP](model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. 1. 
**[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. @@ -313,8 +313,8 @@ Flax), PyTorch, and/or TensorFlow. | Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | | Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | | WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | -| X-XClip | ❌ | ❌ | ✅ | ❌ | ❌ | -| XGLM | ✅ | ✅ | ✅ | ✅ | ✅ | +| X-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | | XLM | ✅ | ❌ | ✅ | ✅ | ❌ | | XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | | XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/docs/source/en/model_doc/x-clip.mdx b/docs/source/en/model_doc/x-clip.mdx deleted file mode 100644 index e313846425dc2..0000000000000 --- a/docs/source/en/model_doc/x-clip.mdx +++ /dev/null @@ -1,65 +0,0 @@ - - -# X-CLIP - -## Overview - -The X-CLIP model was proposed in []() by . - - -The abstract from the paper is the following: - -** - -Tips: - - - -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). - - -## XClipProcessor - -[[autodoc]] XClipProcessor - - -## XClipConfig - -[[autodoc]] XClipConfig - - from_text_vision_configs - -## XClipTextConfig - -[[autodoc]] XClipTextConfig - -## XClipVisionConfig - -[[autodoc]] XClipVisionConfig - -## XClipModel - -[[autodoc]] XClipModel - - forward - - get_text_features - - get_image_features - -## XClipTextModel - -[[autodoc]] XClipTextModel - - forward - -## XClipVisionModel - -[[autodoc]] XClipVisionModel - - forward diff --git a/docs/source/en/model_doc/xclip.mdx b/docs/source/en/model_doc/xclip.mdx new file mode 100644 index 0000000000000..e788f958f6c36 --- /dev/null +++ b/docs/source/en/model_doc/xclip.mdx @@ -0,0 +1,64 @@ + + +# X-CLIP + +## Overview + +The X-CLIP model was proposed in [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +X-CLIP is a minimal extension of [CLIP](clip) for video. The model consists of a text encoder, a cross-frame vision encoder, a multi-frame integration Transformer, and a video-specific prompt generator. + +The abstract from the paper is the following: + +*Contrastive language-image pretraining has shown great success in learning visual-textual joint representation from web-scale data, demonstrating remarkable "zero-shot" generalization ability for various image tasks. However, how to effectively expand such new language-image pretraining methods to video domains is still an open problem. 
In this work, we present a simple yet effective approach that adapts the pretrained language-image models to video recognition directly, instead of pretraining a new model from scratch. More concretely, to capture the long-range dependencies of frames along the temporal dimension, we propose a cross-frame attention mechanism that explicitly exchanges information across frames. Such module is lightweight and can be plugged into pretrained language-image models seamlessly. Moreover, we propose a video-specific prompting scheme, which leverages video content information for generating discriminative textual prompts. Extensive experiments demonstrate that our approach is effective and can be generalized to different video recognition scenarios. In particular, under fully-supervised settings, our approach achieves a top-1 accuracy of 87.1% on Kinectics-400, while using 12 times fewer FLOPs compared with Swin-L and ViViT-H. In zero-shot experiments, our approach surpasses the current state-of-the-art methods by +7.6% and +14.9% in terms of top-1 accuracy under two popular protocols. In few-shot scenarios, our approach outperforms previous best methods by +32.1% and +23.1% when the labeled data is extremely limited.* + +Tips: + +- Usage of X-CLIP is identical to CLIP. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/microsoft/VideoX/tree/master/X-CLIP). + + +## XClipProcessor + +[[autodoc]] XClipProcessor + +## XClipConfig + +[[autodoc]] XClipConfig + - from_text_vision_configs + +## XClipTextConfig + +[[autodoc]] XClipTextConfig + +## XClipVisionConfig + +[[autodoc]] XClipVisionConfig + +## XClipModel + +[[autodoc]] XClipModel + - forward + - get_text_features + - get_image_features + +## XClipTextModel + +[[autodoc]] XClipTextModel + - forward + +## XClipVisionModel + +[[autodoc]] XClipVisionModel + - forward diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index aaa9d3dcf8068..0ad6b5199920f 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -143,7 +143,7 @@ ("wav2vec2", "Wav2Vec2Config"), ("wav2vec2-conformer", "Wav2Vec2ConformerConfig"), ("wavlm", "WavLMConfig"), - ("x-clip", "XClipConfig"), + ("xclip", "XClipConfig"), ("xglm", "XGLMConfig"), ("xlm", "XLMConfig"), ("xlm-prophetnet", "XLMProphetNetConfig"), @@ -258,7 +258,7 @@ ("vit_mae", "VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2-conformer", "WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("x-clip", "X_XClip_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("xclip", "X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xglm", "XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xlm-prophetnet", "XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -407,7 +407,7 @@ ("wav2vec2-conformer", "Wav2Vec2-Conformer"), ("wav2vec2_phoneme", "Wav2Vec2Phoneme"), ("wavlm", "WavLM"), - ("x-clip", "X-XClip"), + ("xclip", "X-CLIP"), ("xglm", "XGLM"), ("xlm", "XLM"), ("xlm-prophetnet", "XLM-ProphetNet"), diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 89697d24c372d..a8f3b3370dddf 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -533,6 +533,10 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_common_attributes(self): 
pass + @unittest.skip(reason="XClipModel does not support feedforward chunking") + def test_feed_forward_chunking(self): + pass + # override as the `logit_scale`, `prompts_generator.alpha` and `mit.position_embedding` parameters require special treatment def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From 5c448e1f2176f2bc0d358f4f83c2cad86d9f8c22 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 26 Aug 2022 10:04:45 +0000 Subject: [PATCH 23/40] Fix all tests --- docs/source/en/_toctree.yml | 2 ++ src/transformers/models/x_clip/modeling_x_clip.py | 12 ++---------- tests/models/x_clip/test_modeling_x_clip.py | 6 +++--- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 78137d2c8a74c..80d47d6d209e0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -455,6 +455,8 @@ title: Vision Text Dual Encoder - local: model_doc/visual_bert title: VisualBERT + - local: model_doc/xclip + title: X-CLIP title: Multimodal models - isExpanded: false sections: diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 1d4ce2ed148ca..770fd522de14e 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -510,6 +510,8 @@ def _init_weights(self, module): std=module.vision_embed_dim**-0.5 * factor, ) nn.init.normal_(module.prompts_visual_projection, mean=0.0, std=module.vision_embed_dim**-0.5 * factor) + elif isinstance(module, XClipMultiframeIntegrationTransformer): + nn.init.normal_(module.position_embedding, std=self.config.initializer_factor) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() @@ -1412,14 +1414,8 @@ def forward( cls_features = image_embeds.view(batch_size, num_frames, -1) - # print("Shape of MIT input:", cls_features.shape) - # print("Initial values of MIT input:", cls_features[0, :3, :3]) - image_embeds = self.mit(cls_features) - # print("Shape of output of MIT:", image_embeds.shape) - # print("First values of output of MIT:", image_embeds[0, :3]) - img_features = vision_outputs[0][:, 1:, :] img_features = self.prompts_visual_layernorm(img_features) img_features = img_features @ self.prompts_visual_projection @@ -1440,7 +1436,6 @@ def forward( # TODO remove this assertion (text pooler output) # assert torch.allclose(text_embeds[0, :3], torch.tensor([-0.2870, -0.3504, 0.0417]), atol=1e-4) - # print("Looks ok!") text_embeds = text_embeds.unsqueeze(0).expand(batch_size, -1, -1) text_embeds = text_embeds + self.prompts_generator(text_embeds, img_features) @@ -1449,9 +1444,6 @@ def forward( image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) - # print("Shape of image embeds:", image_embeds.shape) - # print("Shape of text embeds:", text_embeds.shape) - # cosine similarity as logits logit_scale = self.logit_scale.exp() logits_per_image = torch.einsum("bd,bkd->bk", image_embeds, logit_scale * text_embeds) diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index a8f3b3370dddf..0aee76e1064ec 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -537,7 +537,7 @@ def test_model_common_attributes(self): def test_feed_forward_chunking(self): pass - # override as the `logit_scale`, `prompts_generator.alpha` and 
`mit.position_embedding` parameters require special treatment + # override as the `logit_scale`, `prompts_generator.alpha` parameters require special treatment def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -554,8 +554,8 @@ def test_initialization(self): delta=1e-3, msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - elif name in ["prompts_generator.alpha", "mit.position_embedding"]: - pass + elif name == "prompts_generator.alpha": + self.assertAlmostEqual(param.data.mean().item(), model.config.prompt_alpha) else: self.assertIn( ((param.data.mean() * 1e9).round() / 1e9).item(), From 252ff54138b13e6932314cd79dc0b2a0b73ab571 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 26 Aug 2022 10:21:47 +0000 Subject: [PATCH 24/40] Add MIT output to XClipOutput --- .../models/x_clip/modeling_x_clip.py | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 770fd522de14e..5afe635bcc895 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -76,7 +76,6 @@ def x_clip_loss(similarity: torch.Tensor) -> torch.Tensor: @dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->XClip class XClipOutput(ModelOutput): """ Args: @@ -93,10 +92,12 @@ class XClipOutput(ModelOutput): image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying the projection layer to the pooled output of [`XClipVisionModel`]. - text_model_output(`BaseModelOutputWithPooling`): + text_model_output (`BaseModelOutputWithPooling`): The output of the [`XClipTextModel`]. - vision_model_output(`BaseModelOutputWithPooling`): + vision_model_output (`BaseModelOutputWithPooling`): The output of the [`XClipVisionModel`]. + mit_output (`BaseModelOutputWithPooling`): + The output of `XClipMultiframeIntegrationTransformer` (MIT for short). 
""" loss: Optional[torch.FloatTensor] = None @@ -106,10 +107,13 @@ class XClipOutput(ModelOutput): image_embeds: torch.FloatTensor = None text_model_output: BaseModelOutputWithPooling = None vision_model_output: BaseModelOutputWithPooling = None + mit_output: BaseModelOutputWithPooling = None def to_tuple(self) -> Tuple[Any]: return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + self[k] + if k not in ["text_model_output", "vision_model_output", "mit_output"] + else getattr(self, k).to_tuple() for k in self.keys() ) @@ -1092,8 +1096,6 @@ def __init__(self, config: XClipVisionConfig): def forward( self, hidden_states, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -1103,7 +1105,6 @@ def forward( # add position embeddings hidden_states = hidden_states + self.position_embedding - # TODO support output hidden states and/or attentions encoder_outputs = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, @@ -1114,7 +1115,17 @@ def forward( last_hidden_state = last_hidden_state.type(hidden_states.dtype) + residual - return last_hidden_state.mean(dim=1, keepdim=False) + pooled_output = last_hidden_state.mean(dim=1, keepdim=False) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) class XClipCrossAttention(nn.Module): @@ -1414,7 +1425,13 @@ def forward( cls_features = image_embeds.view(batch_size, num_frames, -1) - image_embeds = self.mit(cls_features) + mit_outputs = self.mit( + cls_features, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = mit_outputs[1] img_features = vision_outputs[0][:, 1:, :] img_features = self.prompts_visual_layernorm(img_features) @@ -1465,4 +1482,5 @@ def forward( image_embeds=image_embeds, text_model_output=text_outputs, vision_model_output=vision_outputs, + mit_output=mit_outputs, ) From 043704d4ba341edf81807b755c8a948e6d483205 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 26 Aug 2022 10:32:06 +0000 Subject: [PATCH 25/40] Create better variable names --- .../models/x_clip/modeling_x_clip.py | 44 +++++++++---------- tests/models/x_clip/test_modeling_x_clip.py | 6 +-- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 5afe635bcc895..52e8d58a68064 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -80,17 +80,17 @@ class XClipOutput(ModelOutput): """ Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): - Contrastive loss for image-text similarity. - logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): - The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + Contrastive loss for video-text similarity. + logits_per_video (`torch.FloatTensor` of shape `(video_batch_size, text_batch_size)`): + The scaled dot product scores between `video_embeds` and `text_embeds`. 
This represents the video-text similarity scores. - logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): - The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, video_batch_size)`): + The scaled dot product scores between `text_embeds` and `video_embeds`. This represents the text-video similarity scores. text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`XClipTextModel`]. - image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of + video_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The video embeddings obtained by applying the projection layer to the pooled output of [`XClipVisionModel`]. text_model_output (`BaseModelOutputWithPooling`): The output of the [`XClipTextModel`]. @@ -101,10 +101,10 @@ class XClipOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits_per_image: torch.FloatTensor = None + logits_per_video: torch.FloatTensor = None logits_per_text: torch.FloatTensor = None text_embeds: torch.FloatTensor = None - image_embeds: torch.FloatTensor = None + video_embeds: torch.FloatTensor = None text_model_output: BaseModelOutputWithPooling = None vision_model_output: BaseModelOutputWithPooling = None mit_output: BaseModelOutputWithPooling = None @@ -1397,8 +1397,8 @@ def forward( ... ) >>> outputs = model(**inputs) - >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score - >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + >>> logits_per_video = outputs.logits_per_video # this is the video-text similarity score + >>> probs = logits_per_video.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" # Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components. 
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1420,10 +1420,10 @@ def forward( # TODO remove this assertion (vision pooler output) # assert torch.allclose(vision_outputs.pooler_output[0, :3], torch.tensor([-0.2987, 1.0489, 0.3702]), atol=1e-4) - image_embeds = vision_outputs[1] - image_embeds = self.visual_projection(image_embeds) + video_embeds = vision_outputs[1] + video_embeds = self.visual_projection(video_embeds) - cls_features = image_embeds.view(batch_size, num_frames, -1) + cls_features = video_embeds.view(batch_size, num_frames, -1) mit_outputs = self.mit( cls_features, @@ -1431,12 +1431,12 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - image_embeds = mit_outputs[1] + video_embeds = mit_outputs[1] img_features = vision_outputs[0][:, 1:, :] img_features = self.prompts_visual_layernorm(img_features) img_features = img_features @ self.prompts_visual_projection - img_features = img_features.view(batch_size, num_frames, -1, image_embeds.shape[-1]) + img_features = img_features.view(batch_size, num_frames, -1, video_embeds.shape[-1]) img_features = img_features.mean(dim=1, keepdim=False) text_outputs = self.text_model( @@ -1458,28 +1458,28 @@ def forward( text_embeds = text_embeds + self.prompts_generator(text_embeds, img_features) # normalized features - image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + video_embeds = video_embeds / video_embeds.norm(p=2, dim=-1, keepdim=True) text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) # cosine similarity as logits logit_scale = self.logit_scale.exp() - logits_per_image = torch.einsum("bd,bkd->bk", image_embeds, logit_scale * text_embeds) - logits_per_text = logits_per_image.T + logits_per_video = torch.einsum("bd,bkd->bk", video_embeds, logit_scale * text_embeds) + logits_per_text = logits_per_video.T loss = None if return_loss: loss = x_clip_loss(logits_per_text) if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + output = (logits_per_video, logits_per_text, text_embeds, video_embeds, text_outputs, vision_outputs) return ((loss,) + output) if loss is not None else output return XClipOutput( loss=loss, - logits_per_image=logits_per_image, + logits_per_video=logits_per_video, logits_per_text=logits_per_text, text_embeds=text_embeds, - image_embeds=image_embeds, + video_embeds=video_embeds, text_model_output=text_outputs, vision_model_output=vision_outputs, mit_output=mit_outputs, diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 0aee76e1064ec..f8b6f2ba2dbd5 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -473,7 +473,7 @@ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values with torch.no_grad(): result = model(input_ids, pixel_values, attention_mask) self.parent.assertEqual( - result.logits_per_image.shape, + result.logits_per_video.shape, ( self.vision_model_tester.batch_size, self.text_model_tester.batch_size, @@ -666,7 +666,7 @@ def test_inference(self): # verify the logits self.assertEqual( - outputs.logits_per_image.shape, + outputs.logits_per_video.shape, torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), ) self.assertEqual( @@ -676,4 +676,4 @@ def test_inference(self): expected_logits = torch.tensor([[14.3819, 20.6031, 15.0526]], device=torch_device) 
- self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) + self.assertTrue(torch.allclose(outputs.logits_per_video, expected_logits, atol=1e-3)) From 39b20498f90b702f46e9f35fb1d966c836856a39 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 1 Sep 2022 10:14:24 +0000 Subject: [PATCH 26/40] Rename XClip to XCLIP --- docs/source/en/index.mdx | 2 +- docs/source/en/model_doc/xclip.mdx | 28 +-- src/transformers/__init__.py | 32 +-- .../models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/x_clip/__init__.py | 32 +-- .../models/x_clip/configuration_x_clip.py | 68 +++--- .../convert_x_clip_original_pytorch_to_hf.py | 18 +- .../models/x_clip/modeling_x_clip.py | 208 +++++++++--------- .../models/x_clip/processing_x_clip.py | 2 +- src/transformers/models/x_clip/test.py | 6 +- src/transformers/models/x_clip/test_clip.py | 16 -- src/transformers/utils/dummy_pt_objects.py | 8 +- tests/models/x_clip/test_modeling_x_clip.py | 84 +++---- utils/check_config_docstrings.py | 2 +- utils/check_repo.py | 4 +- 16 files changed, 249 insertions(+), 265 deletions(-) delete mode 100644 src/transformers/models/x_clip/test_clip.py diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 4e86bf1718248..3d9578487c380 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -314,7 +314,7 @@ Flax), PyTorch, and/or TensorFlow. | Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | | WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | | X-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | -| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | +| XGLM | ✅ | ✅ | ✅ | ✅ | ✅ | | XLM | ✅ | ❌ | ✅ | ✅ | ❌ | | XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | | XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/docs/source/en/model_doc/xclip.mdx b/docs/source/en/model_doc/xclip.mdx index e788f958f6c36..fbdbf2da44a49 100644 --- a/docs/source/en/model_doc/xclip.mdx +++ b/docs/source/en/model_doc/xclip.mdx @@ -29,36 +29,36 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/VideoX/tree/master/X-CLIP). 
-## XClipProcessor +## XCLIPProcessor -[[autodoc]] XClipProcessor +[[autodoc]] XCLIPProcessor -## XClipConfig +## XCLIPConfig -[[autodoc]] XClipConfig +[[autodoc]] XCLIPConfig - from_text_vision_configs -## XClipTextConfig +## XCLIPTextConfig -[[autodoc]] XClipTextConfig +[[autodoc]] XCLIPTextConfig -## XClipVisionConfig +## XCLIPVisionConfig -[[autodoc]] XClipVisionConfig +[[autodoc]] XCLIPVisionConfig -## XClipModel +## XCLIPModel -[[autodoc]] XClipModel +[[autodoc]] XCLIPModel - forward - get_text_features - get_image_features -## XClipTextModel +## XCLIPTextModel -[[autodoc]] XClipTextModel +[[autodoc]] XCLIPTextModel - forward -## XClipVisionModel +## XCLIPVisionModel -[[autodoc]] XClipVisionModel +[[autodoc]] XCLIPVisionModel - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d6d56bb3ba9fc..5dc10550c15f3 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -370,10 +370,10 @@ ], "models.x_clip": [ "X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", - "XClipConfig", - "XClipProcessor", - "XClipTextConfig", - "XClipVisionConfig", + "XCLIPConfig", + "XCLIPProcessor", + "XCLIPTextConfig", + "XCLIPVisionConfig", ], "models.xglm": ["XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "XGLMConfig"], "models.xlm": ["XLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMConfig", "XLMTokenizer"], @@ -993,10 +993,10 @@ _import_structure["models.x_clip"].extend( [ "X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST", - "XClipModel", - "XClipPreTrainedModel", - "XClipTextModel", - "XClipVisionModel", + "XCLIPModel", + "XCLIPPreTrainedModel", + "XCLIPTextModel", + "XCLIPVisionModel", ] ) _import_structure["models.convbert"].extend( @@ -3183,10 +3183,10 @@ from .models.wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig from .models.x_clip import ( X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, - XClipConfig, - XClipProcessor, - XClipTextConfig, - XClipVisionConfig, + XCLIPConfig, + XCLIPProcessor, + XCLIPTextConfig, + XCLIPVisionConfig, ) from .models.xglm import XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XGLMConfig from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer @@ -4490,10 +4490,10 @@ ) from .models.x_clip import ( X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - XClipModel, - XClipPreTrainedModel, - XClipTextModel, - XClipVisionModel, + XCLIPModel, + XCLIPPreTrainedModel, + XCLIPTextModel, + XCLIPVisionModel, ) from .models.xglm import XGLM_PRETRAINED_MODEL_ARCHIVE_LIST, XGLMForCausalLM, XGLMModel, XGLMPreTrainedModel from .models.xlm import ( diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 0ad6b5199920f..b01eb4bd9bb91 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -143,7 +143,7 @@ ("wav2vec2", "Wav2Vec2Config"), ("wav2vec2-conformer", "Wav2Vec2ConformerConfig"), ("wavlm", "WavLMConfig"), - ("xclip", "XClipConfig"), + ("xclip", "XCLIPConfig"), ("xglm", "XGLMConfig"), ("xlm", "XLMConfig"), ("xlm-prophetnet", "XLMProphetNetConfig"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1c3e1e03e8518..ea27d9ee87d9f 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -137,7 +137,7 @@ ("wav2vec2", "Wav2Vec2Model"), ("wav2vec2-conformer", "Wav2Vec2ConformerModel"), ("wavlm", "WavLMModel"), - ("xclip", "XClipModel"), + ("xclip", "XCLIPModel"), ("xglm", "XGLMModel"), ("xlm", "XLMModel"), ("xlm-prophetnet", 
"XLMProphetNetModel"), diff --git a/src/transformers/models/x_clip/__init__.py b/src/transformers/models/x_clip/__init__.py index 7e328b98f259b..613d2903824a6 100644 --- a/src/transformers/models/x_clip/__init__.py +++ b/src/transformers/models/x_clip/__init__.py @@ -23,11 +23,11 @@ _import_structure = { "configuration_x_clip": [ "X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", - "XClipConfig", - "XClipTextConfig", - "XClipVisionConfig", + "XCLIPConfig", + "XCLIPTextConfig", + "XCLIPVisionConfig", ], - "processing_x_clip": ["XClipProcessor"], + "processing_x_clip": ["XCLIPProcessor"], } try: @@ -38,20 +38,20 @@ else: _import_structure["modeling_x_clip"] = [ "X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST", - "XClipModel", - "XClipPreTrainedModel", - "XClipTextModel", - "XClipVisionModel", + "XCLIPModel", + "XCLIPPreTrainedModel", + "XCLIPTextModel", + "XCLIPVisionModel", ] if TYPE_CHECKING: from .configuration_x_clip import ( X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, - XClipConfig, - XClipTextConfig, - XClipVisionConfig, + XCLIPConfig, + XCLIPTextConfig, + XCLIPVisionConfig, ) - from .processing_x_clip import XClipProcessor + from .processing_x_clip import XCLIPProcessor try: if not is_torch_available(): @@ -61,10 +61,10 @@ else: from .modeling_x_clip import ( X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - XClipModel, - XClipPreTrainedModel, - XClipTextModel, - XClipVisionModel, + XCLIPModel, + XCLIPPreTrainedModel, + XCLIPTextModel, + XCLIPVisionModel, ) else: diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 85e7bc958d931..f9dc06590d6df 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" XClip model configuration""" +""" X-CLIP model configuration""" import copy import os @@ -29,11 +29,11 @@ } -class XClipTextConfig(PretrainedConfig): +class XCLIPTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`XClipModel`]. It is used to instantiate an XClip + This is the configuration class to store the configuration of a [`XCLIPModel`]. It is used to instantiate an X-CLIP model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the XClip + defaults will yield a similar configuration to that of the X-CLIP [microsoft/xclip-base-patch32](https://huggingface.co/microsoft/xclip-base-patch32) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -42,8 +42,8 @@ class XClipTextConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 49408): - Vocabulary size of the XClip text model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`XClipModel`]. + Vocabulary size of the X-CLIP text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`XCLIPModel`]. hidden_size (`int`, *optional*, defaults to 512): Dimensionality of the encoder layers and the pooler layer. 
intermediate_size (`int`, *optional*, defaults to 2048): @@ -73,13 +73,13 @@ class XClipTextConfig(PretrainedConfig): Example: ```python - >>> from transformers import XClipTextModel, XClipTextConfig + >>> from transformers import XCLIPTextModel, XCLIPTextConfig - >>> # Initializing a XClipTextModel with microsoft/xclip-base-patch32 style configuration - >>> configuration = XClipTextConfig() + >>> # Initializing a XCLIPTextModel with microsoft/xclip-base-patch32 style configuration + >>> configuration = XCLIPTextConfig() - >>> # Initializing a XClipTextConfig from the microsoft/xclip-base-patch32 style configuration - >>> model = XClipTextModel(configuration) + >>> # Initializing a XCLIPTextConfig from the microsoft/xclip-base-patch32 style configuration + >>> model = XCLIPTextModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -125,7 +125,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - # get the text config dict if we are loading from XClipConfig + # get the text config dict if we are loading from XCLIPConfig if config_dict.get("model_type") == "xclip": config_dict = config_dict["text_config"] @@ -138,11 +138,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return cls.from_dict(config_dict, **kwargs) -class XClipVisionConfig(PretrainedConfig): +class XCLIPVisionConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`XClipModel`]. It is used to instantiate an XClip + This is the configuration class to store the configuration of a [`XCLIPModel`]. It is used to instantiate an X-CLIP model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the XClip + defaults will yield a similar configuration to that of the X-CLIP [microsoft/xclip-base-patch32](https://huggingface.co/microsoft/xclip-base-patch32) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the @@ -191,13 +191,13 @@ class XClipVisionConfig(PretrainedConfig): Example: ```python - >>> from transformers import XClipVisionModel, XClipVisionConfig + >>> from transformers import XCLIPVisionModel, XCLIPVisionConfig - >>> # Initializing a XClipVisionModel with microsoft/xclip-base-patch32 style configuration - >>> configuration = XClipVisionConfig() + >>> # Initializing a XCLIPVisionModel with microsoft/xclip-base-patch32 style configuration + >>> configuration = XCLIPVisionConfig() - >>> # Initializing a XClipVisionModel model from the microsoft/xclip-base-patch32 style configuration - >>> model = XClipVisionModel(configuration) + >>> # Initializing a XCLIPVisionModel model from the microsoft/xclip-base-patch32 style configuration + >>> model = XCLIPVisionModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -255,7 +255,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - # get the vision config dict if we are loading from XClipConfig + # get the vision config dict if we are loading from XCLIPConfig if config_dict.get("model_type") == "xclip": config_dict = config_dict["vision_config"] @@ -268,19 +268,19 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return cls.from_dict(config_dict, **kwargs) -class XClipConfig(PretrainedConfig): +class XCLIPConfig(PretrainedConfig): r""" - [`XClipConfig`] is the configuration class to store the configuration of a [`XClipModel`]. It is used to - instantiate XClip model according to the specified arguments, defining the text model and vision model configs. + [`XCLIPConfig`] is the configuration class to store the configuration of a [`XCLIPModel`]. It is used to + instantiate X-CLIP model according to the specified arguments, defining the text model and vision model configs. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: text_config_dict (`dict`, *optional*): - Dictionary of configuration options used to initialize [`XClipTextConfig`]. + Dictionary of configuration options used to initialize [`XCLIPTextConfig`]. vision_config_dict (`dict`, *optional*): - Dictionary of configuration options used to initialize [`XClipVisionConfig`]. + Dictionary of configuration options used to initialize [`XCLIPVisionConfig`]. projection_dim (`int`, *optional*, defaults to 512): Dimentionality of text and vision projection layers. prompt_layers (`int`, *optional*, defaults to 2): @@ -297,7 +297,7 @@ class XClipConfig(PretrainedConfig): prompt_projection_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for the projection layers in the video specific prompt generator. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The inital value of the *logit_scale* parameter. Default is used as per the original XClip implementation. + The inital value of the *logit_scale* parameter. Default is used as per the original XCLIP implementation. kwargs (*optional*): Dictionary of keyword arguments. """ @@ -323,14 +323,14 @@ def __init__( if text_config_dict is None: text_config_dict = {} - logger.info("text_config_dict is None. Initializing the XClipTextConfig with default values.") + logger.info("text_config_dict is None. 
Initializing the XCLIPTextConfig with default values.") if vision_config_dict is None: vision_config_dict = {} - logger.info("vision_config_dict is None. initializing the XClipVisionConfig with default values.") + logger.info("vision_config_dict is None. initializing the XCLIPVisionConfig with default values.") - self.text_config = XClipTextConfig(**text_config_dict) - self.vision_config = XClipVisionConfig(**vision_config_dict) + self.text_config = XCLIPTextConfig(**text_config_dict) + self.vision_config = XCLIPVisionConfig(**vision_config_dict) self.projection_dim = projection_dim self.prompt_layers = prompt_layers @@ -343,13 +343,13 @@ def __init__( self.initializer_factor = 1.0 @classmethod - def from_text_vision_configs(cls, text_config: XClipTextConfig, vision_config: XClipVisionConfig, **kwargs): + def from_text_vision_configs(cls, text_config: XCLIPTextConfig, vision_config: XCLIPVisionConfig, **kwargs): r""" - Instantiate a [`XClipConfig`] (or a derived class) from xclip text model configuration and xclip vision model + Instantiate a [`XCLIPConfig`] (or a derived class) from xclip text model configuration and xclip vision model configuration. Returns: - [`XClipConfig`]: An instance of a configuration object + [`XCLIPConfig`]: An instance of a configuration object """ return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 6d179c9ef5cec..63fdebeb5a60c 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -23,14 +23,14 @@ CLIPTokenizer, CLIPTokenizerFast, VideoMAEFeatureExtractor, - XClipConfig, - XClipModel, - XClipProcessor, + XCLIPConfig, + XCLIPModel, + XCLIPProcessor, ) def get_xclip_config(model_name): - config = XClipConfig() + config = XCLIPConfig() return config @@ -180,13 +180,13 @@ def prepare_video(): def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_path=None, push_to_hub=False): config = get_xclip_config(model_name) - model = XClipModel(config) + model = XCLIPModel(config) model.eval() state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] state_dict = convert_state_dict(state_dict, config) - model = XClipModel(config) + model = XCLIPModel(config) missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) assert missing_keys == ["text_model.embeddings.position_ids", "vision_model.embeddings.position_ids"] model.eval() @@ -194,7 +194,7 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat feature_extractor = VideoMAEFeatureExtractor() slow_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") fast_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32") - processor = XClipProcessor(feature_extractor=feature_extractor, tokenizer=fast_tokenizer) + processor = XCLIPProcessor(feature_extractor=feature_extractor, tokenizer=fast_tokenizer) video = prepare_video() inputs = processor( @@ -205,8 +205,8 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat outputs = model(**inputs) # Verify outputs - logits_per_image = outputs.logits_per_image - probs = logits_per_image.softmax(dim=1) + logits_per_video = outputs.logits_per_video + probs = logits_per_video.softmax(dim=1) expected_probs = 
torch.tensor([[0.0019, 0.9951, 0.0030]]) assert torch.allclose(probs, expected_probs, atol=1e-3) print("Looks ok!") diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 52e8d58a68064..21ee1c16c4817 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -33,7 +33,7 @@ logging, replace_return_docstrings, ) -from .configuration_x_clip import XClipConfig, XClipTextConfig, XClipVisionConfig +from .configuration_x_clip import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig logger = logging.get_logger(__name__) @@ -63,7 +63,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] # contrastive loss function, adapted from -# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/XClip.html +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/clip.html def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) @@ -76,7 +76,7 @@ def x_clip_loss(similarity: torch.Tensor) -> torch.Tensor: @dataclass -class XClipOutput(ModelOutput): +class XCLIPOutput(ModelOutput): """ Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): @@ -88,16 +88,16 @@ class XClipOutput(ModelOutput): The scaled dot product scores between `text_embeds` and `video_embeds`. This represents the text-video similarity scores. text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to the pooled output of [`XClipTextModel`]. + The text embeddings obtained by applying the projection layer to the pooled output of [`XCLIPTextModel`]. video_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): The video embeddings obtained by applying the projection layer to the pooled output of - [`XClipVisionModel`]. + [`XCLIPVisionModel`]. text_model_output (`BaseModelOutputWithPooling`): - The output of the [`XClipTextModel`]. + The output of the [`XCLIPTextModel`]. vision_model_output (`BaseModelOutputWithPooling`): - The output of the [`XClipVisionModel`]. + The output of the [`XCLIPVisionModel`]. mit_output (`BaseModelOutputWithPooling`): - The output of `XClipMultiframeIntegrationTransformer` (MIT for short). + The output of `XCLIPMultiframeIntegrationTransformer` (MIT for short). 
""" loss: Optional[torch.FloatTensor] = None @@ -118,9 +118,9 @@ def to_tuple(self) -> Tuple[Any]: ) -# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->XClip -class XClipVisionEmbeddings(nn.Module): - def __init__(self, config: XClipVisionConfig): +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->XCLIP +class XCLIPVisionEmbeddings(nn.Module): + def __init__(self, config: XCLIPVisionConfig): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -149,9 +149,9 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->XClip -class XClipTextEmbeddings(nn.Module): - def __init__(self, config: XClipTextConfig): +# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->XCLIP +class XCLIPTextEmbeddings(nn.Module): + def __init__(self, config: XCLIPTextConfig): super().__init__() embed_dim = config.hidden_size @@ -181,8 +181,8 @@ def forward( return embeddings -# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->XClip -class XClipAttention(nn.Module): +# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->XCLIP +class XCLIPAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config): @@ -286,8 +286,8 @@ def forward( return attn_output, attn_weights_reshaped -# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->XClip -class XClipMLP(nn.Module): +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->XCLIP +class XCLIPMLP(nn.Module): def __init__(self, config): super().__init__() self.config = config @@ -302,14 +302,14 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->XClip -class XClipEncoderLayer(nn.Module): - def __init__(self, config: XClipConfig): +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->XCLIP +class XCLIPEncoderLayer(nn.Module): + def __init__(self, config: XCLIPConfig): super().__init__() self.embed_dim = config.hidden_size - self.self_attn = XClipAttention(config) + self.self_attn = XCLIPAttention(config) self.layer_norm1 = nn.LayerNorm(self.embed_dim) - self.mlp = XClipMLP(config) + self.mlp = XCLIPMLP(config) self.layer_norm2 = nn.LayerNorm(self.embed_dim) def forward( @@ -374,8 +374,8 @@ def drop_path(input, drop_prob: float = 0.0, training: bool = False): return output -# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->XClip -class XClipDropPath(nn.Module): +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->XCLIP +class XCLIPDropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" def __init__(self, drop_prob: Optional[float] = None) -> None: @@ -389,12 +389,12 @@ def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -class XClipVisionEncoderLayer(nn.Module): +class XCLIPVisionEncoderLayer(nn.Module): """ This corresponds to the `CrossFramelAttentionBlock` class in the original implementation. 
""" - def __init__(self, config: XClipConfig): + def __init__(self, config: XCLIPConfig): super().__init__() self.num_frames = config.num_frames @@ -402,13 +402,13 @@ def __init__(self, config: XClipConfig): self.message_fc = nn.Linear(self.embed_dim, self.embed_dim) self.message_ln = nn.LayerNorm(self.embed_dim) - self.message_attn = XClipAttention(config) + self.message_attn = XCLIPAttention(config) - self.drop_path = XClipDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = XCLIPDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() - self.self_attn = XClipAttention(config) + self.self_attn = XCLIPAttention(config) self.layer_norm1 = nn.LayerNorm(self.embed_dim) - self.mlp = XClipMLP(config) + self.mlp = XCLIPMLP(config) self.layer_norm2 = nn.LayerNorm(self.embed_dim) def forward( @@ -465,13 +465,13 @@ def forward( return outputs -class XClipPreTrainedModel(PreTrainedModel): +class XCLIPPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = XClipConfig + config_class = XCLIPConfig base_model_prefix = "x_clip" supports_gradient_checkpointing = True _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -479,15 +479,15 @@ class XClipPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_factor - if isinstance(module, XClipTextEmbeddings): + if isinstance(module, XCLIPTextEmbeddings): module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) - elif isinstance(module, XClipVisionEmbeddings): + elif isinstance(module, XCLIPVisionEmbeddings): factor = self.config.initializer_factor nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) - elif isinstance(module, XClipAttention): + elif isinstance(module, XCLIPAttention): factor = self.config.initializer_factor in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor out_proj_std = (module.embed_dim**-0.5) * factor @@ -495,7 +495,7 @@ def _init_weights(self, module): nn.init.normal_(module.k_proj.weight, std=in_proj_std) nn.init.normal_(module.v_proj.weight, std=in_proj_std) nn.init.normal_(module.out_proj.weight, std=out_proj_std) - elif isinstance(module, XClipMLP): + elif isinstance(module, XCLIPMLP): factor = self.config.initializer_factor in_proj_std = ( (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor @@ -503,7 +503,7 @@ def _init_weights(self, module): fc_std = (2 * module.config.hidden_size) ** -0.5 * factor nn.init.normal_(module.fc1.weight, std=fc_std) nn.init.normal_(module.fc2.weight, std=in_proj_std) - elif isinstance(module, XClipModel): + elif isinstance(module, XCLIPModel): factor = self.config.initializer_factor nn.init.normal_( module.text_projection.weight, @@ -514,7 +514,7 @@ def _init_weights(self, module): std=module.vision_embed_dim**-0.5 * factor, ) nn.init.normal_(module.prompts_visual_projection, mean=0.0, std=module.vision_embed_dim**-0.5 * factor) - elif isinstance(module, XClipMultiframeIntegrationTransformer): + elif isinstance(module, 
XCLIPMultiframeIntegrationTransformer): nn.init.normal_(module.position_embedding, std=self.config.initializer_factor) if isinstance(module, nn.LayerNorm): @@ -526,7 +526,7 @@ def _init_weights(self, module): module.bias.data.zero_() def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (XClipEncoder, XClipVisionEncoder)): + if isinstance(module, (XCLIPEncoder, XCLIPVisionEncoder)): module.gradient_checkpointing = value @@ -536,7 +536,7 @@ def _set_gradient_checkpointing(self, module, value=False): behavior. Parameters: - config ([`XClipConfig`]): Model configuration class with all the parameters of the model. + config ([`XCLIPConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ @@ -626,20 +626,20 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->XClip -class XClipEncoder(nn.Module): +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->XCLIP +class XCLIPEncoder(nn.Module): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`XClipEncoderLayer`]. + [`XCLIPEncoderLayer`]. Args: - config: XClipConfig + config: XCLIPConfig """ - def __init__(self, config: XClipConfig): + def __init__(self, config: XCLIPConfig): super().__init__() self.config = config - self.layers = nn.ModuleList([XClipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.layers = nn.ModuleList([XCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -730,17 +730,17 @@ def custom_forward(*inputs): ) -class XClipTextTransformer(nn.Module): - def __init__(self, config: XClipTextConfig): +class XCLIPTextTransformer(nn.Module): + def __init__(self, config: XCLIPTextConfig): super().__init__() self.config = config embed_dim = config.hidden_size - self.embeddings = XClipTextEmbeddings(config) - self.encoder = XClipEncoder(config) + self.embeddings = XCLIPTextEmbeddings(config) + self.encoder = XCLIPEncoder(config) self.final_layer_norm = nn.LayerNorm(embed_dim) @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XClipTextConfig) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -815,12 +815,12 @@ def _build_causal_attention_mask(self, bsz, seq_len, dtype): return mask -class XClipTextModel(XClipPreTrainedModel): - config_class = XClipTextConfig +class XCLIPTextModel(XCLIPPreTrainedModel): + config_class = XCLIPTextConfig - def __init__(self, config: XClipTextConfig): + def __init__(self, config: XCLIPTextConfig): super().__init__(config) - self.text_model = XClipTextTransformer(config) + self.text_model = XCLIPTextTransformer(config) # Initialize weights and apply final processing self.post_init() @@ -831,7 +831,7 @@ def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XClipTextConfig) + 
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -847,9 +847,9 @@ def forward( Examples: ```python - >>> from transformers import CLIPTokenizer, XClipTextModel + >>> from transformers import CLIPTokenizer, XCLIPTextModel - >>> model = XClipTextModel.from_pretrained("microsoft/xclip-base-patch32") + >>> model = XCLIPTextModel.from_pretrained("microsoft/xclip-base-patch32") >>> tokenizer = CLIPTokenizer.from_pretrained("microsoft/xclip-base-patch32") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -868,19 +868,19 @@ def forward( ) -class XClipVisionEncoder(nn.Module): +class XCLIPVisionEncoder(nn.Module): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`XClipVisionEncoderLayer`]. + [`XCLIPVisionEncoderLayer`]. Args: - config: XClipConfig + config: XCLIPConfig """ - def __init__(self, config: XClipConfig): + def __init__(self, config: XCLIPConfig): super().__init__() self.config = config - self.layers = nn.ModuleList([XClipVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.layers = nn.ModuleList([XCLIPVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -971,23 +971,23 @@ def custom_forward(*inputs): ) -class XClipVisionTransformer(nn.Module): +class XCLIPVisionTransformer(nn.Module): """ This corresponds to the `CrossFrameCommunicationTransformer` class in the original implementation. """ - def __init__(self, config: XClipVisionConfig): + def __init__(self, config: XCLIPVisionConfig): super().__init__() self.config = config embed_dim = config.hidden_size - self.embeddings = XClipVisionEmbeddings(config) + self.embeddings = XCLIPVisionEmbeddings(config) self.pre_layernorm = nn.LayerNorm(embed_dim) - self.encoder = XClipVisionEncoder(config) + self.encoder = XCLIPVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim) @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XClipVisionConfig) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig) def forward( self, pixel_values: torch.FloatTensor, @@ -1030,13 +1030,13 @@ def forward( ) -class XClipVisionModel(XClipPreTrainedModel): - config_class = XClipVisionConfig +class XCLIPVisionModel(XCLIPPreTrainedModel): + config_class = XCLIPVisionConfig main_input_name = "pixel_values" - def __init__(self, config: XClipVisionConfig): + def __init__(self, config: XCLIPVisionConfig): super().__init__(config) - self.vision_model = XClipVisionTransformer(config) + self.vision_model = XCLIPVisionTransformer(config) # Initialize weights and apply final processing self.post_init() @@ -1044,7 +1044,7 @@ def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XClipVisionConfig) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig) def forward( self, pixel_values: Optional[torch.FloatTensor] = None, @@ -1060,9 +1060,9 @@ def forward( ```python >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, XClipVisionModel + >>> 
from transformers import CLIPProcessor, XCLIPVisionModel - >>> model = XClipVisionModel.from_pretrained("microsoft/xclip-base-patch32") + >>> model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32") >>> processor = CLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -1082,16 +1082,16 @@ def forward( ) -class XClipMultiframeIntegrationTransformer(nn.Module): +class XCLIPMultiframeIntegrationTransformer(nn.Module): """ This corresponds to the `MultiframeIntegrationTransformer` class in the original implementation. """ - def __init__(self, config: XClipVisionConfig): + def __init__(self, config: XCLIPVisionConfig): super().__init__() self.position_embedding = nn.Parameter(torch.empty(1, config.num_frames, config.hidden_size)) - self.encoder = XClipEncoder(config) + self.encoder = XCLIPEncoder(config) def forward( self, @@ -1128,7 +1128,7 @@ def forward( ) -class XClipCrossAttention(nn.Module): +class XCLIPCrossAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config): @@ -1173,7 +1173,7 @@ def __init__(self, config): super().__init__() embed_dim = config.projection_dim - self.cross_attn = XClipCrossAttention(config) + self.cross_attn = XCLIPCrossAttention(config) self.norm1 = nn.LayerNorm(embed_dim) self.norm3 = nn.LayerNorm(embed_dim) self.mlp = nn.Sequential( @@ -1189,7 +1189,7 @@ def forward(self, x, visual): return x -class XClipPromptGenerator(nn.Module): +class XCLIPPromptGenerator(nn.Module): """This corresponds to the `VideoSpecificPrompt` class in the original implementation.""" def __init__(self, config): @@ -1208,21 +1208,21 @@ def forward(self, text, visual): @add_start_docstrings(X_CLIP_START_DOCSTRING) -class XClipModel(XClipPreTrainedModel): - config_class = XClipConfig +class XCLIPModel(XCLIPPreTrainedModel): + config_class = XCLIPConfig - def __init__(self, config: XClipConfig): + def __init__(self, config: XCLIPConfig): super().__init__(config) - if not isinstance(config.text_config, XClipTextConfig): + if not isinstance(config.text_config, XCLIPTextConfig): raise ValueError( - "config.text_config is expected to be of type XClipTextConfig but is of type" + "config.text_config is expected to be of type XCLIPTextConfig but is of type" f" {type(config.text_config)}." ) - if not isinstance(config.vision_config, XClipVisionConfig): + if not isinstance(config.vision_config, XCLIPVisionConfig): raise ValueError( - "config.vision_config is expected to be of type XClipVisionConfig but is of type" + "config.vision_config is expected to be of type XCLIPVisionConfig but is of type" f" {type(config.vision_config)}." 
) @@ -1233,8 +1233,8 @@ def __init__(self, config: XClipConfig): self.text_embed_dim = text_config.hidden_size self.vision_embed_dim = vision_config.hidden_size - self.text_model = XClipTextTransformer(text_config) - self.vision_model = XClipVisionTransformer(vision_config) + self.text_model = XCLIPTextTransformer(text_config) + self.vision_model = XCLIPVisionTransformer(vision_config) self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) @@ -1248,9 +1248,9 @@ def __init__(self, config: XClipConfig): mit_config.intermediate_size = vision_config.mit_intermediate_size mit_config.num_hidden_layers = vision_config.mit_num_hidden_layers mit_config.num_attention_heads = vision_config.mit_num_attention_heads - self.mit = XClipMultiframeIntegrationTransformer(mit_config) + self.mit = XCLIPMultiframeIntegrationTransformer(mit_config) - self.prompts_generator = XClipPromptGenerator(config) + self.prompts_generator = XCLIPPromptGenerator(config) # Initialize weights and apply final processing self.post_init() @@ -1268,14 +1268,14 @@ def get_text_features( r""" Returns: text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by - applying the projection layer to the pooled output of [`XClipTextModel`]. + applying the projection layer to the pooled output of [`XCLIPTextModel`]. Examples: ```python - >>> from transformers import CLIPTokenizer, XClipModel + >>> from transformers import CLIPTokenizer, XCLIPModel - >>> model = XClipModel.from_pretrained("microsoft/xclip-base-patch32") + >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32") >>> tokenizer = CLIPTokenizer.from_pretrained("microsoft/xclip-base-patch32") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -1313,16 +1313,16 @@ def get_video_features( r""" Returns: image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by - applying the projection layer to the pooled output of [`XClipVisionModel`]. + applying the projection layer to the pooled output of [`XCLIPVisionModel`]. 
Examples: ```python >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, XClipModel + >>> from transformers import CLIPProcessor, XCLIPModel - >>> model = XClipModel.from_pretrained("microsoft/xclip-base-patch32") + >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32") >>> processor = CLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -1364,7 +1364,7 @@ def get_video_features( return image_features @add_start_docstrings_to_model_forward(X_CLIP_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=XClipOutput, config_class=XClipConfig) + @replace_return_docstrings(output_type=XCLIPOutput, config_class=XCLIPConfig) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1375,7 +1375,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, XClipOutput]: + ) -> Union[Tuple, XCLIPOutput]: r""" Returns: @@ -1384,9 +1384,9 @@ def forward( ```python >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, XClipModel + >>> from transformers import CLIPProcessor, XCLIPModel - >>> model = XClipModel.from_pretrained("microsoft/xclip-base-patch32") + >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32") >>> processor = CLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -1474,7 +1474,7 @@ def forward( output = (logits_per_video, logits_per_text, text_embeds, video_embeds, text_outputs, vision_outputs) return ((loss,) + output) if loss is not None else output - return XClipOutput( + return XCLIPOutput( loss=loss, logits_per_video=logits_per_video, logits_per_text=logits_per_text, diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py index 06932f093d6f5..7e694a3e339ec 100644 --- a/src/transformers/models/x_clip/processing_x_clip.py +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -19,7 +19,7 @@ from ...tokenization_utils_base import BatchEncoding -class XClipProcessor(ProcessorMixin): +class XCLIPProcessor(ProcessorMixin): r""" Constructs an X-CLIP processor which wraps a VideoMAE feature extractor and a CLIP tokenizer into a single processor. 
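[Editor's note] A minimal sketch of how the renamed `XCLIPProcessor` is assembled from the two components named in the docstring above (a VideoMAE feature extractor and a CLIP tokenizer). It is based on the conversion script elsewhere in this patch series; the `openai/clip-vit-base-patch32` tokenizer checkpoint is an assumption carried over from that script rather than part of this hunk.

```python
from transformers import CLIPTokenizerFast, VideoMAEFeatureExtractor, XCLIPProcessor

# the two components wrapped by XCLIPProcessor, mirroring the conversion script;
# the tokenizer checkpoint is the CLIP one the patch reuses, not an X-CLIP-specific asset
feature_extractor = VideoMAEFeatureExtractor()
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
processor = XCLIPProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# a single call then routes text to the tokenizer and video frames to the feature extractor
```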
diff --git a/src/transformers/models/x_clip/test.py b/src/transformers/models/x_clip/test.py index c60be1a5be2d0..bdadbd39a02e4 100644 --- a/src/transformers/models/x_clip/test.py +++ b/src/transformers/models/x_clip/test.py @@ -1,11 +1,11 @@ import torch from huggingface_hub import hf_hub_download -from transformers import AutoTokenizer, XClipConfig, XClipModel +from transformers import AutoTokenizer, XCLIPConfig, XCLIPModel -config = XClipConfig() -model = XClipModel(config) +config = XCLIPConfig() +model = XCLIPModel(config) file_path = hf_hub_download( repo_id="hf-internal-testing/spaghetti-video-8-frames", filename="pixel_values.pt", repo_type="dataset" diff --git a/src/transformers/models/x_clip/test_clip.py b/src/transformers/models/x_clip/test_clip.py deleted file mode 100644 index 11bc7a0c975b4..0000000000000 --- a/src/transformers/models/x_clip/test_clip.py +++ /dev/null @@ -1,16 +0,0 @@ -from PIL import Image - -import requests -from transformers import CLIPConfig, CLIPModel, CLIPProcessor - - -model = CLIPModel(CLIPConfig()) -processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - -url = "http://images.cocodataset.org/val2017/000000039769.jpg" -image = Image.open(requests.get(url, stream=True).raw) - -inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) - -outputs = model(**inputs) -print(outputs[0]) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index ff16d25ec585e..1c9b7531932e9 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5171,28 +5171,28 @@ def __init__(self, *args, **kwargs): X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None -class XClipModel(metaclass=DummyObject): +class XCLIPModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class XClipPreTrainedModel(metaclass=DummyObject): +class XCLIPPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class XClipTextModel(metaclass=DummyObject): +class XCLIPTextModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class XClipVisionModel(metaclass=DummyObject): +class XCLIPVisionModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index f8b6f2ba2dbd5..b14895e89a04f 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch XClip model. """ +""" Testing suite for the PyTorch XCLIP model. 
""" import inspect @@ -23,7 +23,7 @@ import numpy as np from huggingface_hub import hf_hub_download -from transformers import XClipConfig, XClipTextConfig, XClipVisionConfig +from transformers import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -41,15 +41,15 @@ import torch from torch import nn - from transformers import XClipModel, XClipTextModel, XClipVisionModel + from transformers import XCLIPModel, XCLIPTextModel, XCLIPVisionModel from transformers.models.x_clip.modeling_x_clip import X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST if is_vision_available(): - from transformers import XClipProcessor + from transformers import XCLIPProcessor -class XClipVisionModelTester: +class XCLIPVisionModelTester: def __init__( self, parent, @@ -99,7 +99,7 @@ def prepare_config_and_inputs(self): return config, pixel_values def get_config(self): - return XClipVisionConfig( + return XCLIPVisionConfig( image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, @@ -115,7 +115,7 @@ def get_config(self): ) def create_and_check_model(self, config, pixel_values): - model = XClipVisionModel(config=config) + model = XCLIPVisionModel(config=config) model.to(torch_device) model.eval() with torch.no_grad(): @@ -137,22 +137,22 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class XClipVisionModelTest(ModelTesterMixin, unittest.TestCase): +class XCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): """ Here we also overwrite some of the tests of test_modeling_common.py, as X_CLIP does not use input_ids, inputs_embeds, attention_mask and seq_length. 
""" - all_model_classes = (XClipVisionModel,) if is_torch_available() else () + all_model_classes = (XCLIPVisionModel,) if is_torch_available() else () fx_compatible = False test_pruning = False test_resize_embeddings = False test_head_masking = False def setUp(self): - self.model_tester = XClipVisionModelTester(self) + self.model_tester = XCLIPVisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=XClipVisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=XCLIPVisionConfig, has_text_modality=False, hidden_size=37 ) def test_config(self): @@ -193,18 +193,18 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="XClipVisionModel has no base class and is not available in MODEL_MAPPING") + @unittest.skip(reason="XCLIPVisionModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - @unittest.skip(reason="XClipVisionModel has no base class and is not available in MODEL_MAPPING") + @unittest.skip(reason="XCLIPVisionModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @slow def test_model_from_pretrained(self): for model_name in X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = XClipVisionModel.from_pretrained(model_name) + model = XCLIPVisionModel.from_pretrained(model_name) self.assertIsNotNone(model) def test_gradient_checkpointing_backward_compatibility(self): @@ -306,7 +306,7 @@ def test_multi_gpu_data_parallel_forward(self): _ = model(**self._prepare_for_class(inputs_dict, model_class)) -class XClipTextModelTester: +class XCLIPTextModelTester: def __init__( self, parent, @@ -362,7 +362,7 @@ def prepare_config_and_inputs(self): return config, input_ids, input_mask def get_config(self): - return XClipTextConfig( + return XCLIPTextConfig( vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, @@ -375,7 +375,7 @@ def get_config(self): ) def create_and_check_model(self, config, input_ids, input_mask): - model = XClipTextModel(config=config) + model = XCLIPTextModel(config=config) model.to(torch_device) model.eval() with torch.no_grad(): @@ -392,16 +392,16 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class XClipTextModelTest(ModelTesterMixin, unittest.TestCase): +class XCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (XClipTextModel,) if is_torch_available() else () + all_model_classes = (XCLIPTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False test_head_masking = False def setUp(self): - self.model_tester = XClipTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=XClipTextConfig, hidden_size=37) + self.model_tester = XCLIPTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=XCLIPTextConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @@ -420,28 +420,28 @@ def test_training_gradient_checkpointing(self): def test_inputs_embeds(self): pass - @unittest.skip(reason="XClipTextModel has no base class and is not available in MODEL_MAPPING") + @unittest.skip(reason="XCLIPTextModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - @unittest.skip(reason="XClipTextModel has no base class and is not available in MODEL_MAPPING") + @unittest.skip(reason="XCLIPTextModel has no base class and is not available in MODEL_MAPPING") 
def test_save_load_fast_init_to_base(self): pass @slow def test_model_from_pretrained(self): for model_name in X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = XClipTextModel.from_pretrained(model_name) + model = XCLIPTextModel.from_pretrained(model_name) self.assertIsNotNone(model) -class XClipModelTester: +class XCLIPModelTester: def __init__(self, parent, projection_dim=64, mit_hidden_size=64, is_training=True): self.parent = parent self.projection_dim = projection_dim self.mit_hidden_size = mit_hidden_size - self.text_model_tester = XClipTextModelTester(parent) - self.vision_model_tester = XClipVisionModelTester(parent) + self.text_model_tester = XCLIPTextModelTester(parent) + self.vision_model_tester = XCLIPVisionModelTester(parent) self.is_training = is_training def prepare_config_and_inputs(self): @@ -462,14 +462,14 @@ def prepare_config_and_inputs(self): return config, input_ids, attention_mask, pixel_values def get_config(self): - return XClipConfig.from_text_vision_configs( + return XCLIPConfig.from_text_vision_configs( self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=self.projection_dim, ) def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = XClipModel(config).to(torch_device).eval() + model = XCLIPModel(config).to(torch_device).eval() with torch.no_grad(): result = model(input_ids, pixel_values, attention_mask) self.parent.assertEqual( @@ -500,8 +500,8 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class XClipModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (XClipModel,) if is_torch_available() else () +class XCLIPModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (XCLIPModel,) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False @@ -511,7 +511,7 @@ class XClipModelTest(ModelTesterMixin, unittest.TestCase): maxdiff = None def setUp(self): - self.model_tester = XClipModelTester(self) + self.model_tester = XCLIPModelTester(self) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -529,11 +529,11 @@ def test_inputs_embeds(self): def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip(reason="XClipModel does not have input/output embeddings") + @unittest.skip(reason="XCLIPModel does not have input/output embeddings") def test_model_common_attributes(self): pass - @unittest.skip(reason="XClipModel does not support feedforward chunking") + @unittest.skip(reason="XCLIPModel does not support feedforward chunking") def test_feed_forward_chunking(self): pass @@ -617,22 +617,22 @@ def _create_and_check_torchscript(self, config, inputs_dict): def test_load_vision_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # Save XClipConfig and check if we can load XClipVisionConfig from it + # Save XCLIPConfig and check if we can load XCLIPVisionConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: config.save_pretrained(tmp_dir_name) - vision_config = XClipVisionConfig.from_pretrained(tmp_dir_name) + vision_config = XCLIPVisionConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - # Save XClipConfig and check if we can load XClipTextConfig from it + # Save XCLIPConfig and check if we can load XCLIPTextConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: config.save_pretrained(tmp_dir_name) - text_config = 
XClipTextConfig.from_pretrained(tmp_dir_name) + text_config = XCLIPTextConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) @slow def test_model_from_pretrained(self): for model_name in X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = XClipModel.from_pretrained(model_name) + model = XCLIPModel.from_pretrained(model_name) self.assertIsNotNone(model) @@ -647,13 +647,13 @@ def prepare_video(): @require_vision @require_torch -class XClipModelIntegrationTest(unittest.TestCase): +class XCLIPModelIntegrationTest(unittest.TestCase): @slow def test_inference(self): # TODO update organization model_name = "nielsr/xclip-base-patch32" - model = XClipModel.from_pretrained(model_name).to(torch_device) - processor = XClipProcessor.from_pretrained(model_name) + model = XCLIPModel.from_pretrained(model_name).to(torch_device) + processor = XCLIPProcessor.from_pretrained(model_name) video = prepare_video() inputs = processor( diff --git a/utils/check_config_docstrings.py b/utils/check_config_docstrings.py index f65a589b7889b..c444b58f63274 100644 --- a/utils/check_config_docstrings.py +++ b/utils/check_config_docstrings.py @@ -40,7 +40,7 @@ CONFIG_CLASSES_TO_IGNORE_FOR_DOCSTRING_CHECKPOINT_CHECK = { - "XClipConfig", + "XCLIPConfig", "CLIPConfig", "OwlViTConfig", "GroupViTConfig", diff --git a/utils/check_repo.py b/utils/check_repo.py index 7fd6cdb670f89..e035468be44c3 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -125,8 +125,8 @@ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping - "XClipVisionModel", - "XClipTextModel", + "XCLIPVisionModel", + "XCLIPTextModel", "DPTForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", From 26f8307c64795adc780eb579c6ad373d6f18db60 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 1 Sep 2022 10:26:28 +0000 Subject: [PATCH 27/40] Extend conversion script --- .../convert_x_clip_original_pytorch_to_hf.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 63fdebeb5a60c..9b9caf749e8ea 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -26,11 +26,18 @@ XCLIPConfig, XCLIPModel, XCLIPProcessor, + XCLIPTextConfig, + XCLIPVisionConfig, ) -def get_xclip_config(model_name): - config = XCLIPConfig() +def get_xclip_config(model_name, num_frames): + text_config = XCLIPTextConfig() + + # derive patch size from model name + patch_size = int(model_name[-2:]) + vision_config = XCLIPVisionConfig(patch_size=patch_size, num_frames=num_frames) + config = XCLIPConfig.from_text_vision_configs(text_config, vision_config) return config @@ -178,8 +185,8 @@ def prepare_video(): return list(video) -def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_path=None, push_to_hub=False): - config = get_xclip_config(model_name) +def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dump_folder_path=None, push_to_hub=False): + config = get_xclip_config(model_name, num_frames=num_frames) model = XCLIPModel(config) model.eval() @@ -207,7 +214,13 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat # Verify outputs logits_per_video = outputs.logits_per_video probs = 
logits_per_video.softmax(dim=1) - expected_probs = torch.tensor([[0.0019, 0.9951, 0.0030]]) + print("Probs:", probs) + if model_name == "xclip-base-patch32": + expected_probs = torch.tensor([[0.0019, 0.9951, 0.0030]]) + elif model_name == "xclip-base-patch16": + expected_probs = torch.tensor([[0.0083, 0.9681, 0.0236]]) + else: + raise ValueError(f"Model name {model_name} not supported") assert torch.allclose(probs, expected_probs, atol=1e-3) print("Looks ok!") @@ -237,6 +250,7 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat type=str, help="Name of the model.", ) + parser.add_argument("--num_frames", default=8, type=int, help="Number of frames (can be 8 or 16).") parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) @@ -245,4 +259,6 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, pytorch_dump_folder_pat ) args = parser.parse_args() - convert_xclip_checkpoint(args.checkpoint_url, args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) + convert_xclip_checkpoint( + args.checkpoint_url, args.model_name, args.num_frames, args.pytorch_dump_folder_path, args.push_to_hub + ) From 658027e9b570a204a639b46de56621417e89aad1 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 1 Sep 2022 11:44:32 +0000 Subject: [PATCH 28/40] Add support for large models --- .../convert_x_clip_original_pytorch_to_hf.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 9b9caf749e8ea..c876f4064c142 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -18,6 +18,7 @@ import numpy as np import torch +import gdown from huggingface_hub import hf_hub_download from transformers import ( CLIPTokenizer, @@ -37,7 +38,24 @@ def get_xclip_config(model_name, num_frames): # derive patch size from model name patch_size = int(model_name[-2:]) vision_config = XCLIPVisionConfig(patch_size=patch_size, num_frames=num_frames) + + if "large" in model_name: + text_config.hidden_size = 768 + text_config.intermediate_size = 3072 + text_config.num_attention_heads = 12 + + vision_config.hidden_size = 1024 + vision_config.intermediate_size = 4096 + vision_config.num_attention_heads = 16 + vision_config.num_hidden_layers = 24 + vision_config.mit_hidden_size = 768 + vision_config.mit_intermediate_size = 3072 + config = XCLIPConfig.from_text_vision_configs(text_config, vision_config) + + if "large" in model_name: + config.projection_dim = 768 + return config @@ -190,7 +208,13 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dum model = XCLIPModel(config) model.eval() - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] + if "drive" in checkpoint_url: + output = "pytorch_model.bin" + gdown.cached_download(checkpoint_url, output, quiet=False) + state_dict = torch.load(output, map_location="cpu")["model"] + else: + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] + state_dict = convert_state_dict(state_dict, config) model = XCLIPModel(config) @@ -219,6 +243,8 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dum expected_probs = torch.tensor([[0.0019, 0.9951, 0.0030]]) elif model_name == "xclip-base-patch16": expected_probs = 
torch.tensor([[0.0083, 0.9681, 0.0236]]) + elif model_name == "xclip-large-patch14": + expected_probs = torch.tensor([[0.0062, 0.9864, 0.0075]]) else: raise ValueError(f"Model name {model_name} not supported") assert torch.allclose(probs, expected_probs, atol=1e-3) From 1c5a560c48d4a83dea3275b37ab556ba8d362dfc Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 1 Sep 2022 11:58:53 +0000 Subject: [PATCH 29/40] Add support for 16 frame models --- .../x_clip/convert_x_clip_original_pytorch_to_hf.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index c876f4064c142..2545e62711c4b 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -36,7 +36,8 @@ def get_xclip_config(model_name, num_frames): text_config = XCLIPTextConfig() # derive patch size from model name - patch_size = int(model_name[-2:]) + start_idx = model_name.find("patch") + patch_size = int(model_name[start_idx + len("patch") : start_idx + len("patch") + 2]) vision_config = XCLIPVisionConfig(patch_size=patch_size, num_frames=num_frames) if "large" in model_name: @@ -195,9 +196,11 @@ def convert_state_dict(orig_state_dict, config): return orig_state_dict -def prepare_video(): +def prepare_video(num_frames): + filename = "eating_spaghetti_8_frames.npy" if num_frames == 8 else "eating_spaghetti.npy" file = hf_hub_download( - repo_id="datasets/hf-internal-testing/spaghetti-video", filename="eating_spaghetti_8_frames.npy" + repo_id="datasets/hf-internal-testing/spaghetti-video", + filename=filename, ) video = np.load(file) return list(video) @@ -227,7 +230,7 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dum fast_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32") processor = XCLIPProcessor(feature_extractor=feature_extractor, tokenizer=fast_tokenizer) - video = prepare_video() + video = prepare_video(num_frames) inputs = processor( text=["playing sports", "eating spaghetti", "go shopping"], videos=video, return_tensors="pt", padding=True ) @@ -241,6 +244,8 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dum print("Probs:", probs) if model_name == "xclip-base-patch32": expected_probs = torch.tensor([[0.0019, 0.9951, 0.0030]]) + elif model_name == "xclip-base-patch32-16-frames": + expected_probs = torch.tensor([[7.0999e-04, 9.9883e-01, 4.5580e-04]]) elif model_name == "xclip-base-patch16": expected_probs = torch.tensor([[0.0083, 0.9681, 0.0236]]) elif model_name == "xclip-large-patch14": From 19cbc88b8262dbfd8fb014c9216008c73ef40fda Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 1 Sep 2022 12:03:24 +0000 Subject: [PATCH 30/40] Add another model' --- .../models/x_clip/convert_x_clip_original_pytorch_to_hf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 2545e62711c4b..0db0d9ef6df81 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -248,6 +248,8 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dum expected_probs = torch.tensor([[7.0999e-04, 9.9883e-01, 
4.5580e-04]]) elif model_name == "xclip-base-patch16": expected_probs = torch.tensor([[0.0083, 0.9681, 0.0236]]) + elif model_name == "xclip-base-patch16-16-frames": + expected_probs = torch.tensor([[7.6937e-04, 9.9728e-01, 1.9473e-03]]) elif model_name == "xclip-large-patch14": expected_probs = torch.tensor([[0.0062, 0.9864, 0.0075]]) else: From 4b3b1d3dfd681af8743623afac25d63152fd7085 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 1 Sep 2022 16:06:59 +0000 Subject: [PATCH 31/40] Fix module issue --- src/transformers/models/auto/configuration_auto.py | 1 + src/transformers/models/auto/tokenization_auto.py | 2 +- tests/models/x_clip/test_modeling_x_clip.py | 10 ++-------- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index b01eb4bd9bb91..1b6daca6e45a8 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -428,6 +428,7 @@ ("data2vec-text", "data2vec"), ("data2vec-vision", "data2vec"), ("donut-swin", "donut"), + ("xclip", "x_clip"), ] ) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 3f68227cd852d..a59c4469adbd6 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -254,7 +254,7 @@ ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), ( - "x-clip", + "xclip", ( "CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None, diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index b14895e89a04f..4a34361b900ac 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -474,17 +474,11 @@ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values result = model(input_ids, pixel_values, attention_mask) self.parent.assertEqual( result.logits_per_video.shape, - ( - self.vision_model_tester.batch_size, - self.text_model_tester.batch_size, - ), + (self.vision_model_tester.batch_size, self.text_model_tester.batch_size), ) self.parent.assertEqual( result.logits_per_text.shape, - ( - self.text_model_tester.batch_size, - self.vision_model_tester.batch_size, - ), + (self.text_model_tester.batch_size, self.vision_model_tester.batch_size), ) def prepare_config_and_inputs_for_common(self): From c1461cd3944800693ca090ccdc3e15e57f274870 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 2 Sep 2022 08:37:16 +0000 Subject: [PATCH 32/40] Apply suggestions from code review --- docs/source/en/model_doc/xclip.mdx | 2 +- src/transformers/__init__.py | 8 +-- src/transformers/models/x_clip/__init__.py | 8 +-- .../models/x_clip/configuration_x_clip.py | 2 +- .../models/x_clip/modeling_x_clip.py | 70 +++++++++++-------- src/transformers/models/x_clip/test.py | 8 ++- src/transformers/utils/dummy_pt_objects.py | 2 +- tests/models/x_clip/test_modeling_x_clip.py | 16 ++--- utils/check_config_docstrings.py | 2 +- utils/check_repo.py | 4 +- 10 files changed, 67 insertions(+), 55 deletions(-) diff --git a/docs/source/en/model_doc/xclip.mdx b/docs/source/en/model_doc/xclip.mdx index fbdbf2da44a49..725ea0bc2c07e 100644 --- a/docs/source/en/model_doc/xclip.mdx +++ b/docs/source/en/model_doc/xclip.mdx @@ -51,7 +51,7 @@ The original code can be found [here](https://github.com/microsoft/VideoX/tree/m 
[[autodoc]] XCLIPModel - forward - get_text_features - - get_image_features + - get_video_features ## XCLIPTextModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5dc10550c15f3..2b25df3250a68 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -369,7 +369,7 @@ "WavLMConfig", ], "models.x_clip": [ - "X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", + "XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "XCLIPConfig", "XCLIPProcessor", "XCLIPTextConfig", @@ -992,7 +992,7 @@ ) _import_structure["models.x_clip"].extend( [ - "X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST", + "XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST", "XCLIPModel", "XCLIPPreTrainedModel", "XCLIPTextModel", @@ -3182,7 +3182,7 @@ from .models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM from .models.wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig from .models.x_clip import ( - X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, + XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, XCLIPConfig, XCLIPProcessor, XCLIPTextConfig, @@ -4489,7 +4489,7 @@ WavLMPreTrainedModel, ) from .models.x_clip import ( - X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST, XCLIPModel, XCLIPPreTrainedModel, XCLIPTextModel, diff --git a/src/transformers/models/x_clip/__init__.py b/src/transformers/models/x_clip/__init__.py index 613d2903824a6..10d848b7bc4e6 100644 --- a/src/transformers/models/x_clip/__init__.py +++ b/src/transformers/models/x_clip/__init__.py @@ -22,7 +22,7 @@ _import_structure = { "configuration_x_clip": [ - "X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", + "XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "XCLIPConfig", "XCLIPTextConfig", "XCLIPVisionConfig", @@ -37,7 +37,7 @@ pass else: _import_structure["modeling_x_clip"] = [ - "X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST", + "XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST", "XCLIPModel", "XCLIPPreTrainedModel", "XCLIPTextModel", @@ -46,7 +46,7 @@ if TYPE_CHECKING: from .configuration_x_clip import ( - X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, + XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig, @@ -60,7 +60,7 @@ pass else: from .modeling_x_clip import ( - X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST, XCLIPModel, XCLIPPreTrainedModel, XCLIPTextModel, diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index f9dc06590d6df..30f9214eb8b4b 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -24,7 +24,7 @@ logger = logging.get_logger(__name__) -X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = { +XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = { "microsoft/xclip-base-patch32": "https://huggingface.co/microsoft/xclip-base-patch32/resolve/main/config.json", } diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 21ee1c16c4817..0dd401c130272 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -40,7 +40,7 @@ _CHECKPOINT_FOR_DOC = "microsoft/xclip-base-patch32" -X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ +XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ # TODO update to appropriate organization "nielsr/xclip-base-patch32", # See all X-CLIP models at https://huggingface.co/models?filter=x-clip @@ -770,7 +770,7 @@ def forward( bsz, seq_len = input_shape # X_CLIP's text model uses causal mask, prepare it here. 
- # https://github.com/openai/X_CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/x_clip/model.py#L324 + # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to( hidden_states.device ) @@ -1152,17 +1152,29 @@ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): def forward(self, queries, keys, values): """Input shape: Batch x Time x Channel""" - B, N, C = queries.shape - B, M, C = keys.shape - q = self.q_proj(queries).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) - k = self.k_proj(keys).reshape(B, M, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) - v = self.v_proj(values).reshape(B, M, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + batch_size, query_seq_len, hidden_size = queries.shape + batch_size, key_seq_len, hidden_size = keys.shape + queries = ( + self.q_proj(queries) + .reshape(batch_size, query_seq_len, self.num_heads, hidden_size // self.num_heads) + .permute(0, 2, 1, 3) + ) + keys = ( + self.k_proj(keys) + .reshape(batch_size, key_seq_len, self.num_heads, hidden_size // self.num_heads) + .permute(0, 2, 1, 3) + ) + values = ( + self.v_proj(values) + .reshape(batch_size, key_seq_len, self.num_heads, hidden_size // self.num_heads) + .permute(0, 2, 1, 3) + ) - attn = (q @ k.transpose(-2, -1)) * self.scale + attn = (queries @ keys.transpose(-2, -1)) * self.scale attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) - x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = (attn @ values).transpose(1, 2).reshape(batch_size, query_seq_len, hidden_size) x = self.proj(x) x = self.proj_drop(x) return x @@ -1297,10 +1309,10 @@ def get_text_features( return_dict=return_dict, ) - pooled_output = text_outputs[1] - text_features = self.text_projection(pooled_output) + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) - return text_features + return text_embeds @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING) def get_video_features( @@ -1312,8 +1324,9 @@ def get_video_features( ) -> torch.FloatTensor: r""" Returns: - image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by - applying the projection layer to the pooled output of [`XCLIPVisionModel`]. + video_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The video embeddings obtained by + applying the projection layer to the pooled output of [`XCLIPVisionModel`] and + [`XCLIPMultiframeIntegrationTransformer`]. Examples: @@ -1330,7 +1343,7 @@ def get_video_features( >>> inputs = processor(images=image, return_tensors="pt") - >>> image_features = model.get_video_features(**inputs) + >>> video_features = model.get_video_features(**inputs) ```""" # Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components. 
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1349,19 +1362,20 @@ def get_video_features( return_dict=return_dict, ) - pooled_output = vision_outputs[1] # pooled_output - image_features = self.visual_projection(pooled_output) - - # TODO add the following: - # img_features = self.prompts_visual_ln(img_features) - # img_features = img_features @ self.prompts_visual_proj + video_embeds = vision_outputs[1] + video_embeds = self.visual_projection(video_embeds) - # cls_features = cls_features.view(b, t, -1) - # img_features = img_features.view(b,t,-1,cls_features.shape[-1]) + cls_features = video_embeds.view(batch_size, num_frames, -1) - # video_features = self.mit(cls_features) + mit_outputs = self.mit( + cls_features, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + video_embeds = mit_outputs[1] - return image_features + return video_embeds @add_start_docstrings_to_model_forward(X_CLIP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=XCLIPOutput, config_class=XCLIPConfig) @@ -1417,9 +1431,6 @@ def forward( return_dict=return_dict, ) - # TODO remove this assertion (vision pooler output) - # assert torch.allclose(vision_outputs.pooler_output[0, :3], torch.tensor([-0.2987, 1.0489, 0.3702]), atol=1e-4) - video_embeds = vision_outputs[1] video_embeds = self.visual_projection(video_embeds) @@ -1451,9 +1462,6 @@ def forward( text_embeds = text_outputs[1] text_embeds = self.text_projection(text_embeds) - # TODO remove this assertion (text pooler output) - # assert torch.allclose(text_embeds[0, :3], torch.tensor([-0.2870, -0.3504, 0.0417]), atol=1e-4) - text_embeds = text_embeds.unsqueeze(0).expand(batch_size, -1, -1) text_embeds = text_embeds + self.prompts_generator(text_embeds, img_features) diff --git a/src/transformers/models/x_clip/test.py b/src/transformers/models/x_clip/test.py index bdadbd39a02e4..a6c460825f84c 100644 --- a/src/transformers/models/x_clip/test.py +++ b/src/transformers/models/x_clip/test.py @@ -21,6 +21,10 @@ ).input_ids +# with torch.no_grad(): +# outputs = model(input_ids=input_ids, pixel_values=pixel_values, return_loss=True) +# print(outputs[0]) + with torch.no_grad(): - outputs = model(input_ids=input_ids, pixel_values=pixel_values, return_loss=True) - print(outputs[0]) + video_embeds = model.get_video_features(pixel_values) + print("Shape of video embeddings:", video_embeds.shape) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1c9b7531932e9..c974a3ab502a0 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5168,7 +5168,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None +XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None class XCLIPModel(metaclass=DummyObject): diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 4a34361b900ac..9023c6ff6dca8 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -42,7 +42,7 @@ from torch import nn from transformers import XCLIPModel, XCLIPTextModel, XCLIPVisionModel - from transformers.models.x_clip.modeling_x_clip import X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.x_clip.modeling_x_clip import XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST if is_vision_available(): @@ -139,7 +139,7 @@ def 
prepare_config_and_inputs_for_common(self): @require_torch class XCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as X_CLIP does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, as X-CLIP does not use input_ids, inputs_embeds, attention_mask and seq_length. """ @@ -158,7 +158,7 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="X_CLIP does not use inputs_embeds") + @unittest.skip(reason="X-CLIP does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -203,7 +203,7 @@ def test_save_load_fast_init_to_base(self): @slow def test_model_from_pretrained(self): - for model_name in X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = XCLIPVisionModel.from_pretrained(model_name) self.assertIsNotNone(model) @@ -416,7 +416,7 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="X_CLIP does not use inputs_embeds") + @unittest.skip(reason="X-CLIP does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -430,7 +430,7 @@ def test_save_load_fast_init_to_base(self): @slow def test_model_from_pretrained(self): - for model_name in X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = XCLIPTextModel.from_pretrained(model_name) self.assertIsNotNone(model) @@ -571,7 +571,7 @@ def _create_and_check_torchscript(self, config, inputs_dict): try: input_ids = inputs_dict["input_ids"] - pixel_values = inputs_dict["pixel_values"] # X_CLIP needs pixel_values + pixel_values = inputs_dict["pixel_values"] # X-CLIP needs pixel_values traced_model = torch.jit.trace(model, (input_ids, pixel_values)) except RuntimeError: self.fail("Couldn't trace module.") @@ -625,7 +625,7 @@ def test_load_vision_text_config(self): @slow def test_model_from_pretrained(self): - for model_name in X_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = XCLIPModel.from_pretrained(model_name) self.assertIsNotNone(model) diff --git a/utils/check_config_docstrings.py b/utils/check_config_docstrings.py index c444b58f63274..b0e5d1ced5168 100644 --- a/utils/check_config_docstrings.py +++ b/utils/check_config_docstrings.py @@ -40,7 +40,6 @@ CONFIG_CLASSES_TO_IGNORE_FOR_DOCSTRING_CHECKPOINT_CHECK = { - "XCLIPConfig", "CLIPConfig", "OwlViTConfig", "GroupViTConfig", @@ -50,6 +49,7 @@ "SpeechEncoderDecoderConfig", "VisionEncoderDecoderConfig", "VisionTextDualEncoderConfig", + "XCLIPConfig", } diff --git a/utils/check_repo.py b/utils/check_repo.py index e035468be44c3..517d583b9ec59 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -125,8 +125,6 @@ # should **not** be the rule. 
IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping - "XCLIPVisionModel", - "XCLIPTextModel", "DPTForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", @@ -200,6 +198,8 @@ "TFWav2Vec2ForCTC", "TFHubertForCTC", "MaskFormerForInstanceSegmentation", + "XCLIPVisionModel", + "XCLIPTextModel", ] # Update this list for models that have multiple model types for the same From 2ceb582f9c80ccef9e45e4a1e344f87a1881448a Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 2 Sep 2022 09:26:08 +0000 Subject: [PATCH 33/40] Add figure to docs --- docs/source/en/model_doc/xclip.mdx | 5 +++++ src/transformers/models/x_clip/modeling_x_clip.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/docs/source/en/model_doc/xclip.mdx b/docs/source/en/model_doc/xclip.mdx index 725ea0bc2c07e..4d572b6760071 100644 --- a/docs/source/en/model_doc/xclip.mdx +++ b/docs/source/en/model_doc/xclip.mdx @@ -25,6 +25,11 @@ Tips: - Usage of X-CLIP is identical to CLIP. + + + X-CLIP architecture. Taken from the original paper. + This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/VideoX/tree/master/X-CLIP). diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 0dd401c130272..fb75dafb4e0cd 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -424,6 +424,11 @@ def forward( attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. `(config.encoder_attention_heads,)`. + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
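For reference, the verification block in the conversion script above doubles as a usage recipe for the converted checkpoints: the processor packs a list of video frames together with candidate captions, and the model returns per-video logits over those captions. Below is a minimal sketch of that flow, assuming the final `microsoft/xclip-base-patch32` checkpoint name used later in this series and random frames standing in for the `eating_spaghetti` test clip:

```python
import numpy as np
import torch

from transformers import XCLIPModel, XCLIPProcessor

# Assumed checkpoint name; any converted X-CLIP checkpoint should work the same way.
model_name = "microsoft/xclip-base-patch32"
processor = XCLIPProcessor.from_pretrained(model_name)
model = XCLIPModel.from_pretrained(model_name)

# 8 RGB frames of 224x224, e.g. sampled uniformly from a clip (random data here for illustration).
video = list(np.random.randint(0, 256, (8, 224, 224, 3), dtype=np.uint8))

inputs = processor(
    text=["playing sports", "eating spaghetti", "go shopping"],
    videos=video,
    return_tensors="pt",
    padding=True,
)

with torch.no_grad():
    outputs = model(**inputs)

# One score per (video, caption) pair; softmax over the captions gives per-video probabilities,
# mirroring the expected_probs checks in convert_x_clip_original_pytorch_to_hf.py.
probs = outputs.logits_per_video.softmax(dim=1)
print(probs)

# Video-side counterpart of get_text_features: pooled video embeddings after the
# Multiframe Integration Transformer.
video_embeds = model.get_video_features(pixel_values=inputs.pixel_values)
print(video_embeds.shape)
```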
From 9f4b3dcc509a71d6a9e171922da3a2a564930b8d Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 2 Sep 2022 12:01:30 +0000 Subject: [PATCH 34/40] Fix CLIPProcessor issue --- src/transformers/models/clip/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py index 932130f8d5fdf..637d78b0da799 100644 --- a/src/transformers/models/clip/__init__.py +++ b/src/transformers/models/clip/__init__.py @@ -36,6 +36,7 @@ "CLIPTextConfig", "CLIPVisionConfig", ], + "processing_clip": ["CLIPProcessor"], "tokenization_clip": ["CLIPTokenizer"], } @@ -54,7 +55,6 @@ pass else: _import_structure["feature_extraction_clip"] = ["CLIPFeatureExtractor"] - _import_structure["processing_clip"] = ["CLIPProcessor"] try: if not is_torch_available(): @@ -108,6 +108,7 @@ CLIPTextConfig, CLIPVisionConfig, ) + from .processing_clip import CLIPProcessor from .tokenization_clip import CLIPTokenizer try: @@ -125,7 +126,6 @@ pass else: from .feature_extraction_clip import CLIPFeatureExtractor - from .processing_clip import CLIPProcessor try: if not is_torch_available(): From a110fe3a94e17c79f0ed6cec4ef7a863df869ecd Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 7 Sep 2022 13:36:02 +0000 Subject: [PATCH 35/40] Apply suggestions from code review --- .../models/x_clip/modeling_x_clip.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index fb75dafb4e0cd..b879714ac8d83 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -397,7 +397,6 @@ class XCLIPVisionEncoderLayer(nn.Module): def __init__(self, config: XCLIPConfig): super().__init__() self.num_frames = config.num_frames - self.embed_dim = config.hidden_size self.message_fc = nn.Linear(self.embed_dim, self.embed_dim) @@ -773,15 +772,15 @@ def forward( hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) - bsz, seq_len = input_shape + batch_size, seq_len = input_shape # X_CLIP's text model uses causal mask, prepare it here. 
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 - causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to( + causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_len, hidden_states.dtype).to( hidden_states.device ) # expand attention_mask if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] attention_mask = _expand_mask(attention_mask, hidden_states.dtype) encoder_outputs = self.encoder( @@ -810,10 +809,10 @@ def forward( attentions=encoder_outputs.attentions, ) - def _build_causal_attention_mask(self, bsz, seq_len, dtype): + def _build_causal_attention_mask(self, batch_size, seq_len, dtype): # lazily create causal attention mask, with full attention between the vision tokens # pytorch uses additive attention mask; fill with -inf - mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype) + mask = torch.empty(batch_size, seq_len, seq_len, dtype=dtype) mask.fill_(torch.tensor(torch.finfo(dtype).min)) mask.triu_(1) # zero out the lower diagonal mask = mask.unsqueeze(1) # expand mask @@ -1065,10 +1064,10 @@ def forward( ```python >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, XCLIPVisionModel + >>> from transformers import XCLIPProcessor, XCLIPVisionModel >>> model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32") - >>> processor = CLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") + >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1152,8 +1151,8 @@ def __init__(self, config): self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(config.prompt_projection_dropout) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward(self, queries, keys, values): """Input shape: Batch x Time x Channel""" @@ -1338,10 +1337,10 @@ def get_video_features( ```python >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, XCLIPModel + >>> from transformers import XCLIPProcessor, XCLIPModel >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32") - >>> processor = CLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") + >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1403,10 +1402,10 @@ def forward( ```python >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, XCLIPModel + >>> from transformers import XCLIPProcessor, XCLIPModel >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32") - >>> processor = CLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") + >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) From 04d75382699f1f256abe9d15009b349431893902 Mon Sep 
17 00:00:00 2001 From: NielsRogge Date: Wed, 7 Sep 2022 13:44:50 +0000 Subject: [PATCH 36/40] Delete file --- src/transformers/models/x_clip/test.py | 30 -------------------------- 1 file changed, 30 deletions(-) delete mode 100644 src/transformers/models/x_clip/test.py diff --git a/src/transformers/models/x_clip/test.py b/src/transformers/models/x_clip/test.py deleted file mode 100644 index a6c460825f84c..0000000000000 --- a/src/transformers/models/x_clip/test.py +++ /dev/null @@ -1,30 +0,0 @@ -import torch - -from huggingface_hub import hf_hub_download -from transformers import AutoTokenizer, XCLIPConfig, XCLIPModel - - -config = XCLIPConfig() -model = XCLIPModel(config) - -file_path = hf_hub_download( - repo_id="hf-internal-testing/spaghetti-video-8-frames", filename="pixel_values.pt", repo_type="dataset" -) -pixel_values = torch.load(file_path) - -pixel_values = torch.cat([pixel_values, pixel_values], dim=0) -print("Shape of pixel values:", pixel_values.shape) - -tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") -input_ids = tokenizer( - ["playing sports", "eating spaghetti", "go shopping"], padding="max_length", return_tensors="pt" -).input_ids - - -# with torch.no_grad(): -# outputs = model(input_ids=input_ids, pixel_values=pixel_values, return_loss=True) -# print(outputs[0]) - -with torch.no_grad(): - video_embeds = model.get_video_features(pixel_values) - print("Shape of video embeddings:", video_embeds.shape) From c5e2d4b516c84fa666bd6d8f68741b9078262cad Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 7 Sep 2022 17:15:44 +0000 Subject: [PATCH 37/40] Convert more checkpoints --- .../convert_x_clip_original_pytorch_to_hf.py | 104 +++++++++++++++--- 1 file changed, 90 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 0db0d9ef6df81..f694b1bfa55a0 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -52,6 +52,9 @@ def get_xclip_config(model_name, num_frames): vision_config.mit_hidden_size = 768 vision_config.mit_intermediate_size = 3072 + if model_name == "xclip-large-patch14-16-frames": + vision_config.image_size = 336 + config = XCLIPConfig.from_text_vision_configs(text_config, vision_config) if "large" in model_name: @@ -197,7 +200,12 @@ def convert_state_dict(orig_state_dict, config): def prepare_video(num_frames): - filename = "eating_spaghetti_8_frames.npy" if num_frames == 8 else "eating_spaghetti.npy" + if num_frames == 8: + filename = "eating_spaghetti_8_frames.npy" + elif num_frames == 16: + filename = "eating_spaghetti.npy" + elif num_frames == 32: + filename = "eating_spaghetti_32_frames.npy" file = hf_hub_download( repo_id="datasets/hf-internal-testing/spaghetti-video", filename=filename, @@ -206,8 +214,66 @@ def prepare_video(num_frames): return list(video) -def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dump_folder_path=None, push_to_hub=False): - config = get_xclip_config(model_name, num_frames=num_frames) +def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): + + model_to_url = { + # fully supervised kinetics-400 checkpoints + "xclip-base-patch32": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_32_8.pth", + "xclip-base-patch32-16-frames": ( + 
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_32_16.pth" + ), + "xclip-base-patch16": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_16_8.pth", + "xclip-base-patch16-16-frames": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_16_16.pth" + ), + "xclip-large-patch14": "https://drive.google.com/u/0/uc?id=1NUOImq0o5DlQTST17iIP3vG7DgmHQuCx&export=download&confirm=t&uuid=b26caedc-88e2-473e-830a-9d158b653cdb", + "xclip-large-patch14-16-frames": "https://drive.google.com/u/0/uc?id=1FOYgnJc097OJ4lGwtRCCydQyVPJEOH7d&export=download&confirm=t&uuid=538fa810-e671-4050-b385-9a623f89804f", + # fully supervised kinetics-600 checkpoints + "xclip-base-patch16-kinetics-600": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k600_16_8.pth" + ), + "xclip-base-patch16-kinetics-600-16-frames": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k600_16_16.pth" + ), + "xclip-large-patch14-kinetics-600": ( + "https://drive.google.com/file/d/1FV8C1INuM91sLAN4ImjzePLIlpMSihwV/view?usp=sharing" + ), + # few shot + "xclip-base-patch16-hmdb-2-shot": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_2.pth" + ), + "xclip-base-patch16-hmdb-4-shot": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_4.pth" + ), + "xclip-base-patch16-hmdb-8-shot": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_8.pth" + ), + "xclip-base-patch16-hmdb-16-shot": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_16.pth" + ), + "xclip-base-patch16-ucf-2-shot": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_2.pth" + ), + "xclip-base-patch16-ucf-4-shot": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_4.pth" + ), + "xclip-base-patch16-ucf-8-shot": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_8.pth" + ), + "xclip-base-patch16-ucf-16-shot": ( + "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_16.pth" + ), + # zero shot + } + + checkpoint_url = model_to_url[model_name] + num_frames = 8 + if "16-frames" in model_name: + num_frames = 16 + elif "shot" in model_name: + num_frames = 32 + + config = get_xclip_config(model_name, num_frames) model = XCLIPModel(config) model.eval() @@ -225,7 +291,8 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dum assert missing_keys == ["text_model.embeddings.position_ids", "vision_model.embeddings.position_ids"] model.eval() - feature_extractor = VideoMAEFeatureExtractor() + size = 336 if model_name == "xclip-large-patch14-16-frames" else 224 + feature_extractor = VideoMAEFeatureExtractor(size=size) slow_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") fast_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32") processor = XCLIPProcessor(feature_extractor=feature_extractor, tokenizer=fast_tokenizer) @@ -235,6 +302,8 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dum text=["playing sports", "eating spaghetti", "go shopping"], videos=video, return_tensors="pt", padding=True ) + print("Shape of pixel values:", inputs.pixel_values.shape) + with torch.no_grad(): outputs = model(**inputs) @@ -252,6 +321,22 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dum expected_probs = torch.tensor([[7.6937e-04, 9.9728e-01, 1.9473e-03]]) elif model_name 
== "xclip-large-patch14": expected_probs = torch.tensor([[0.0062, 0.9864, 0.0075]]) + elif model_name == "xclip-large-patch14-16-frames": + expected_probs = torch.tensor([[3.3877e-04, 9.9937e-01, 2.8888e-04]]) + elif model_name == "xclip-base-patch16-hmdb-2-shot": + expected_probs = torch.tensor([[7.1890e-06, 9.9994e-01, 5.6559e-05]]) + elif model_name == "xclip-base-patch16-hmdb-4-shot": + expected_probs = torch.tensor([[1.0320e-05, 9.9993e-01, 6.2435e-05]]) + elif model_name == "xclip-base-patch16-hmdb-8-shot": + expected_probs = torch.tensor([[4.1377e-06, 9.9990e-01, 9.8386e-05]]) + elif model_name == "xclip-base-patch16-hmdb-16-shot": + expected_probs = torch.tensor([[4.1347e-05, 9.9962e-01, 3.3411e-04]]) + elif model_name == "xclip-base-patch16-ucf-2-shot": + expected_probs = torch.tensor([[8.5857e-05, 9.9928e-01, 6.3291e-04]]) + elif model_name == "xclip-base-patch16-ucf-4-shot": + expected_probs = torch.tensor([[8.5857e-05, 9.9928e-01, 6.3291e-04]]) + elif model_name == "xclip-base-patch16-ucf-8-shot": + expected_probs = torch.tensor([[0.0027, 0.9904, 0.0070]]) else: raise ValueError(f"Model name {model_name} not supported") assert torch.allclose(probs, expected_probs, atol=1e-3) @@ -271,19 +356,12 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dum if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_32_8.pth", - type=str, - help="URL fo the original PyTorch checkpoint (.pth file).", - ) parser.add_argument( "--model_name", default="xclip-base-patch32", type=str, help="Name of the model.", ) - parser.add_argument("--num_frames", default=8, type=int, help="Number of frames (can be 8 or 16).") parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
) @@ -292,6 +370,4 @@ def convert_xclip_checkpoint(checkpoint_url, model_name, num_frames, pytorch_dum ) args = parser.parse_args() - convert_xclip_checkpoint( - args.checkpoint_url, args.model_name, args.num_frames, args.pytorch_dump_folder_path, args.push_to_hub - ) + convert_xclip_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) From a04da921b4fa6514ddc5716b20fe65a029d5d51a Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 8 Sep 2022 09:02:20 +0000 Subject: [PATCH 38/40] Convert last checkpoint --- .../models/x_clip/convert_x_clip_original_pytorch_to_hf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index f694b1bfa55a0..6d2edcdbbdf0a 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -264,6 +264,7 @@ def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_16.pth" ), # zero shot + "xclip-base-patch16-zero-shot": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/zero.pth", } checkpoint_url = model_to_url[model_name] @@ -337,6 +338,10 @@ def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ expected_probs = torch.tensor([[8.5857e-05, 9.9928e-01, 6.3291e-04]]) elif model_name == "xclip-base-patch16-ucf-8-shot": expected_probs = torch.tensor([[0.0027, 0.9904, 0.0070]]) + elif model_name == "xclip-base-patch16-ucf-16-shot": + expected_probs = torch.tensor([[9.8219e-04, 9.9593e-01, 3.0863e-03]]) + elif model_name == "xclip-base-patch16-zero-shot": + expected_probs = torch.tensor([[3.5082e-04, 9.9785e-01, 1.7966e-03]]) else: raise ValueError(f"Model name {model_name} not supported") assert torch.allclose(probs, expected_probs, atol=1e-3) From eafedc6c2ae1dc9a4b14d2dceb0760b9498b1f99 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 8 Sep 2022 09:04:13 +0000 Subject: [PATCH 39/40] Update nielsr to microsoft --- src/transformers/models/x_clip/modeling_x_clip.py | 3 +-- tests/models/x_clip/test_modeling_x_clip.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index b879714ac8d83..00ae9d720602a 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -41,8 +41,7 @@ _CHECKPOINT_FOR_DOC = "microsoft/xclip-base-patch32" XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - # TODO update to appropriate organization - "nielsr/xclip-base-patch32", + "microsoft/xclip-base-patch32", # See all X-CLIP models at https://huggingface.co/models?filter=x-clip ] diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 9023c6ff6dca8..62c8e9992b0d9 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -644,8 +644,7 @@ def prepare_video(): class XCLIPModelIntegrationTest(unittest.TestCase): @slow def test_inference(self): - # TODO update organization - model_name = "nielsr/xclip-base-patch32" + model_name = "microsoft/xclip-base-patch32" model = XCLIPModel.from_pretrained(model_name).to(torch_device) processor = XCLIPProcessor.from_pretrained(model_name) From 
b14228ff42ca18f66775ecc8274d8963732d69d9 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Thu, 8 Sep 2022 12:32:16 +0000 Subject: [PATCH 40/40] Add remaining models, apply suggestion --- src/transformers/models/auto/tokenization_auto.py | 8 +------- .../convert_x_clip_original_pytorch_to_hf.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index a59c4469adbd6..9eb802b1fb1d8 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -253,13 +253,7 @@ ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), - ( - "xclip", - ( - "CLIPTokenizer", - "CLIPTokenizerFast" if is_tokenizers_available() else None, - ), - ), + ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ( "xglm", ( diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 6d2edcdbbdf0a..2f5364f440986 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -235,9 +235,7 @@ def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ "xclip-base-patch16-kinetics-600-16-frames": ( "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k600_16_16.pth" ), - "xclip-large-patch14-kinetics-600": ( - "https://drive.google.com/file/d/1FV8C1INuM91sLAN4ImjzePLIlpMSihwV/view?usp=sharing" - ), + "xclip-large-patch14-kinetics-600": "https://drive.google.com/u/0/uc?id=1FV8C1INuM91sLAN4ImjzePLIlpMSihwV&export=download&confirm=t&uuid=141d4977-4a65-44ae-864f-4b0c19f838be", # few shot "xclip-base-patch16-hmdb-2-shot": ( "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_2.pth" @@ -312,6 +310,7 @@ def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ logits_per_video = outputs.logits_per_video probs = logits_per_video.softmax(dim=1) print("Probs:", probs) + # kinetics-400 if model_name == "xclip-base-patch32": expected_probs = torch.tensor([[0.0019, 0.9951, 0.0030]]) elif model_name == "xclip-base-patch32-16-frames": @@ -324,6 +323,14 @@ def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ expected_probs = torch.tensor([[0.0062, 0.9864, 0.0075]]) elif model_name == "xclip-large-patch14-16-frames": expected_probs = torch.tensor([[3.3877e-04, 9.9937e-01, 2.8888e-04]]) + # kinetics-600 + elif model_name == "xclip-base-patch16-kinetics-600": + expected_probs = torch.tensor([[0.0555, 0.8914, 0.0531]]) + elif model_name == "xclip-base-patch16-kinetics-600-16-frames": + expected_probs = torch.tensor([[3.8554e-04, 9.9929e-01, 3.2754e-04]]) + elif model_name == "xclip-large-patch14-kinetics-600": + expected_probs = torch.tensor([[0.0036, 0.9920, 0.0045]]) + # few shot elif model_name == "xclip-base-patch16-hmdb-2-shot": expected_probs = torch.tensor([[7.1890e-06, 9.9994e-01, 5.6559e-05]]) elif model_name == "xclip-base-patch16-hmdb-4-shot": @@ -340,6 +347,7 @@ def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ expected_probs = torch.tensor([[0.0027, 0.9904, 0.0070]]) elif model_name == "xclip-base-patch16-ucf-16-shot": expected_probs = torch.tensor([[9.8219e-04, 
9.9593e-01, 3.0863e-03]]) + # zero shot elif model_name == "xclip-base-patch16-zero-shot": expected_probs = torch.tensor([[3.5082e-04, 9.9785e-01, 1.7966e-03]]) else: