From 7c2fcf8c007a5708608fe3c3edbd1e329e3e9acf Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 24 Jul 2022 16:38:32 +0200 Subject: [PATCH 01/35] First draft --- .../models/donut/convert_donut_to_pytorch.py | 164 ++++++++++++++++++ src/transformers/models/swin/modeling_swin.py | 1 + .../convert_trocr_unilm_to_pytorch.py | 0 3 files changed, 165 insertions(+) create mode 100644 src/transformers/models/donut/convert_donut_to_pytorch.py rename src/transformers/models/{vision_encoder_decoder => trocr}/convert_trocr_unilm_to_pytorch.py (100%) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py new file mode 100644 index 0000000000000..8bef365ca6e61 --- /dev/null +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -0,0 +1,164 @@ +import argparse + +import torch +from datasets import load_dataset +from PIL import Image + +from donut import DonutModel +from transformers import MBartConfig, MBartForCausalLM, SwinConfig, SwinModel, VisionEncoderDecoderModel + + +def get_configs(): + encoder_config = SwinConfig( + image_size=[1280, 960], + patch_size=4, + depths=[2, 2, 14, 2], + num_heads=[4, 8, 16, 32], + window_size=10, + embed_dim=128, + ) + decoder_config = MBartConfig( + is_decoder=True, + is_encoder_decoder=False, + add_cross_attention=True, + decoder_layers=4, + max_position_embeddings=768, + vocab_size=57580, # several special tokens are added to the vocab of XLMRobertaTokenizer, see repo on the hub (added_tokens.json) + scale_embedding=True, + add_final_layer_norm=True, + ) + + return encoder_config, decoder_config + + +def rename_key(name): + if "encoder.model" in name: + name = name.replace("encoder.model", "encoder") + if "decoder.model" in name: + name = name.replace("decoder.model", "decoder") + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") + if "patch_embed.norm" in name: + name = name.replace("patch_embed.norm", "embeddings.norm") + if name.startswith("encoder"): + if "layers" in name: + name = "encoder." 
+ name + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "attn" in name and "mask" not in name: + name = name.replace("attn", "attention.self") + if "norm1" in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name: + name = name.replace("norm2", "layernorm_after") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + + if name == "encoder.norm.weight": + name = "encoder.layernorm.weight" + if name == "encoder.norm.bias": + name = "encoder.layernorm.bias" + + return name + + +def convert_state_dict(orig_state_dict, model): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + key_split = key.split(".") + layer_num = int(key_split[3]) + block_num = int(key_split[5]) + dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size + + if "weight" in key: + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" + ] = val[:dim, :] + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight" + ] = val[dim : dim * 2, :] + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" + ] = val[-dim:, :] + else: + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias" + ] = val[:dim] + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias" + ] = val[dim : dim * 2] + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias" + ] = val[-dim:] + elif "attn_mask" in key: + # TODO check attn_mask buffers + pass + else: + orig_state_dict[rename_key(key)] = val + + return orig_state_dict + + +def convert_swin_checkpoint(checkpoint_path, pytorch_dump_folder_path): + encoder_config, decoder_config = get_configs() + encoder = SwinModel(encoder_config) + decoder = MBartForCausalLM(decoder_config) + model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) + model.eval() + + state_dict = torch.load(checkpoint_path, map_location="cpu") + new_state_dict = convert_state_dict(state_dict, model) + model.load_state_dict(new_state_dict) + + # load original model + original_model = DonutModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa") + + # verify results on scanned document + dataset = load_dataset("hf-internal-testing/fixtures_docvqa") + image = Image.open(dataset["test"][0]["file"]).convert("RGB") + + pixel_values = original_model.encoder.prepare_input(image).unsqueeze(0) + + task_prompt = "{user_input}" + question = "When is the coffee break?" 
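The `convert_state_dict` function above relies on the fact that the original Swin checkpoints store query, key and value as a single fused `qkv` projection, which has to be sliced into the three separate matrices the Hugging Face attention modules expect. A minimal, self-contained sketch of that slicing, using a toy dimension and illustrative tensor names rather than real checkpoint keys:

```python
import torch

dim = 8  # toy head size; the real Swin stages use much larger all_head_size values

# the original checkpoint fuses the three projections row-wise as [query; key; value]
fused_qkv_weight = torch.randn(3 * dim, dim)
fused_qkv_bias = torch.randn(3 * dim)

query_weight = fused_qkv_weight[:dim, :]
key_weight = fused_qkv_weight[dim : dim * 2, :]
value_weight = fused_qkv_weight[-dim:, :]
query_bias = fused_qkv_bias[:dim]
key_bias = fused_qkv_bias[dim : dim * 2]
value_bias = fused_qkv_bias[-dim:]

# concatenating the slices recovers the fused tensors, so no parameters are lost or reordered
assert torch.equal(torch.cat([query_weight, key_weight, value_weight], dim=0), fused_qkv_weight)
assert torch.equal(torch.cat([query_bias, key_bias, value_bias], dim=0), fused_qkv_bias)
```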
+ user_prompt = task_prompt.replace("{user_input}", question) + + last_hidden_state = original_model.encoder(pixel_values) + + print("First values of last_hidden_state:", last_hidden_state[0, :3, :3]) + + outputs = model.encoder(pixel_values, output_hidden_states=True) + print("Shape of last hidden state HuggingFace one:", outputs.last_hidden_state[0, :3, :3]) + + # TODO assert outputs + # assert torch.allclose(timm_outs, hf_outs, atol=1e-3) + + if pytorch_dump_folder_path is not None: + print(f"Saving model and feature extractor to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + # feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--checkpoint_path", + default="/Users/nielsrogge/Documents/Donut/pytorch_model.bin", + type=str, + help="Path to the original checkpoint you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + required=False, + type=str, + help="Path to the output PyTorch model directory.", + ) + + args = parser.parse_args() + convert_swin_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 48c9b8cccf9ec..bc1ef11b81906 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -667,6 +667,7 @@ def forward( hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) attn_mask = self.get_attn_mask(height_pad, width_pad) + self.register_buffer("attn_mask", attn_mask) if attn_mask is not None: attn_mask = attn_mask.to(hidden_states_windows.device) diff --git a/src/transformers/models/vision_encoder_decoder/convert_trocr_unilm_to_pytorch.py b/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py similarity index 100% rename from src/transformers/models/vision_encoder_decoder/convert_trocr_unilm_to_pytorch.py rename to src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py From 3a2ebc83d7cedf5f3f62e51f6768273383749d84 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 24 Jul 2022 16:57:25 +0200 Subject: [PATCH 02/35] Improve script --- .../models/donut/convert_donut_to_pytorch.py | 61 +++++++++++++------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 8bef365ca6e61..956bbbb50797c 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -1,29 +1,49 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DONUT checkpoints using the original `donut-python`library. 
URL: https://github.com/clovaai/donut""" + import argparse import torch from datasets import load_dataset from PIL import Image -from donut import DonutModel +from donut import DonutConfig, DonutModel from transformers import MBartConfig, MBartForCausalLM, SwinConfig, SwinModel, VisionEncoderDecoderModel -def get_configs(): +def get_configs(model): + original_config = model.config + encoder_config = SwinConfig( - image_size=[1280, 960], + image_size=original_config.input_size, patch_size=4, - depths=[2, 2, 14, 2], + depths=original_config.encoder_layer, num_heads=[4, 8, 16, 32], - window_size=10, + window_size=original_config.window_size, embed_dim=128, ) decoder_config = MBartConfig( is_decoder=True, is_encoder_decoder=False, add_cross_attention=True, - decoder_layers=4, - max_position_embeddings=768, - vocab_size=57580, # several special tokens are added to the vocab of XLMRobertaTokenizer, see repo on the hub (added_tokens.json) + decoder_layers=original_config.decoder_layer, + max_position_embeddings=original_config.max_position_embeddings, + vocab_size=len( + model.decoder.tokenizer + ), # several special tokens are added to the vocab of XLMRobertaTokenizer, see repo on the hub (added_tokens.json) scale_embedding=True, add_final_layer_norm=True, ) @@ -103,20 +123,21 @@ def convert_state_dict(orig_state_dict, model): return orig_state_dict -def convert_swin_checkpoint(checkpoint_path, pytorch_dump_folder_path): - encoder_config, decoder_config = get_configs() +def convert_swin_checkpoint(model_name, pytorch_dump_folder_path): + # load original model + original_model = DonutModel.from_pretrained(model_name) + + # load HuggingFace model + encoder_config, decoder_config = get_configs(original_model) encoder = SwinModel(encoder_config) decoder = MBartForCausalLM(decoder_config) model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) model.eval() - state_dict = torch.load(checkpoint_path, map_location="cpu") + state_dict = original_model.state_dict() new_state_dict = convert_state_dict(state_dict, model) model.load_state_dict(new_state_dict) - # load original model - original_model = DonutModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa") - # verify results on scanned document dataset = load_dataset("hf-internal-testing/fixtures_docvqa") image = Image.open(dataset["test"][0]["file"]).convert("RGB") @@ -134,8 +155,8 @@ def convert_swin_checkpoint(checkpoint_path, pytorch_dump_folder_path): outputs = model.encoder(pixel_values, output_hidden_states=True) print("Shape of last hidden state HuggingFace one:", outputs.last_hidden_state[0, :3, :3]) - # TODO assert outputs - # assert torch.allclose(timm_outs, hf_outs, atol=1e-3) + assert torch.allclose(last_hidden_state, outputs.last_hidden_state, atol=1e-3) + print("Looks ok!") if pytorch_dump_folder_path is not None: print(f"Saving model and feature extractor to {pytorch_dump_folder_path}") @@ -147,10 +168,10 @@ def convert_swin_checkpoint(checkpoint_path, pytorch_dump_folder_path): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( - "--checkpoint_path", - default="/Users/nielsrogge/Documents/Donut/pytorch_model.bin", + "--model_name", + default="naver-clova-ix/donut-base-finetuned-docvqa", type=str, - help="Path to the original checkpoint you'd like to convert.", + help="Name of the original model you'd like to convert.", ) parser.add_argument( "--pytorch_dump_folder_path", @@ -161,4 +182,4 @@ def convert_swin_checkpoint(checkpoint_path, pytorch_dump_folder_path): ) args = 
parser.parse_args() - convert_swin_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path) + convert_swin_checkpoint(args.model_name, args.pytorch_dump_folder_path) From cb7dda31ced8207f1bb21098b86dbcaa5be53264 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 24 Jul 2022 17:29:49 +0200 Subject: [PATCH 03/35] Update script --- .../models/donut/convert_donut_to_pytorch.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 956bbbb50797c..97993aea34ae4 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -16,11 +16,10 @@ import argparse -import torch from datasets import load_dataset from PIL import Image -from donut import DonutConfig, DonutModel +from donut import DonutModel from transformers import MBartConfig, MBartForCausalLM, SwinConfig, SwinModel, VisionEncoderDecoderModel @@ -125,7 +124,7 @@ def convert_state_dict(orig_state_dict, model): def convert_swin_checkpoint(model_name, pytorch_dump_folder_path): # load original model - original_model = DonutModel.from_pretrained(model_name) + original_model = DonutModel.from_pretrained(model_name).eval() # load HuggingFace model encoder_config, decoder_config = get_configs(original_model) @@ -148,15 +147,24 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path): question = "When is the coffee break?" user_prompt = task_prompt.replace("{user_input}", question) - last_hidden_state = original_model.encoder(pixel_values) + original_patch_embed = original_model.encoder.model.patch_embed(pixel_values) + + print("Shape of original patch embeddings:", original_patch_embed.shape) + print("Original patch embeddings:", original_patch_embed[0, :3, :3]) + + patch_embeddings, _ = model.encoder.embeddings(pixel_values) - print("First values of last_hidden_state:", last_hidden_state[0, :3, :3]) + print("Shape of patch embeddings:", patch_embeddings.shape) + print("HuggingFace patch embeddings:", patch_embeddings[0, :3, :3]) + + last_hidden_state = original_model.encoder(pixel_values) + print("First values of original last_hidden_state:", last_hidden_state[0, :3, :3]) outputs = model.encoder(pixel_values, output_hidden_states=True) - print("Shape of last hidden state HuggingFace one:", outputs.last_hidden_state[0, :3, :3]) + print("Shape of last hidden state HuggingFace one:", outputs.last_hidden_states[-1][0, :3, :3]) - assert torch.allclose(last_hidden_state, outputs.last_hidden_state, atol=1e-3) - print("Looks ok!") + # assert torch.allclose(last_hidden_state, outputs.last_hidden_state, atol=1e-3) + # print("Looks ok!") if pytorch_dump_folder_path is not None: print(f"Saving model and feature extractor to {pytorch_dump_folder_path}") From 2c5e831ba197c6fe2afa6f9e88d8604df6a604db Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 24 Jul 2022 19:38:46 +0200 Subject: [PATCH 04/35] Make conversion work --- .../models/donut/convert_donut_to_pytorch.py | 46 +++++++++++-------- src/transformers/models/swin/modeling_swin.py | 7 +-- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 97993aea34ae4..e93e2002bc9b2 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -12,10 +12,11 
@@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert DONUT checkpoints using the original `donut-python`library. URL: https://github.com/clovaai/donut""" +"""Convert DONUT checkpoints using the original `donut-python` library. URL: https://github.com/clovaai/donut""" import argparse +import torch from datasets import load_dataset from PIL import Image @@ -113,7 +114,7 @@ def convert_state_dict(orig_state_dict, model): orig_state_dict[ f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias" ] = val[-dim:] - elif "attn_mask" in key: + elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: # TODO check attn_mask buffers pass else: @@ -122,13 +123,13 @@ def convert_state_dict(orig_state_dict, model): return orig_state_dict -def convert_swin_checkpoint(model_name, pytorch_dump_folder_path): +def convert_swin_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): # load original model original_model = DonutModel.from_pretrained(model_name).eval() # load HuggingFace model encoder_config, decoder_config = get_configs(original_model) - encoder = SwinModel(encoder_config) + encoder = SwinModel(encoder_config, add_final_layer_norm=False) decoder = MBartForCausalLM(decoder_config) model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) model.eval() @@ -141,36 +142,39 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path): dataset = load_dataset("hf-internal-testing/fixtures_docvqa") image = Image.open(dataset["test"][0]["file"]).convert("RGB") + # TODO create DonutProcessor (which combines a DonutFeatureExtractor and XLMRobertaTokenizer) pixel_values = original_model.encoder.prepare_input(image).unsqueeze(0) task_prompt = "{user_input}" question = "When is the coffee break?" 
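As a side note on the encoder/decoder assembly used in `convert_swin_checkpoint` above, here is a tiny-config sketch of how a Swin encoder and an MBart causal-LM decoder are combined into a single `VisionEncoderDecoderModel`. The dimensions are deliberately small placeholders, not the Donut hyperparameters, so the snippet runs quickly and offline:

```python
from transformers import MBartConfig, MBartForCausalLM, SwinConfig, SwinModel, VisionEncoderDecoderModel

# toy-sized encoder and decoder configs (placeholder values, not the Donut ones)
encoder = SwinModel(SwinConfig(image_size=64, embed_dim=16, depths=[1, 1], num_heads=[2, 4], window_size=4))
decoder = MBartForCausalLM(
    MBartConfig(
        is_decoder=True,
        is_encoder_decoder=False,
        add_cross_attention=True,
        decoder_layers=1,
        d_model=64,
        decoder_attention_heads=4,
        decoder_ffn_dim=128,
        vocab_size=100,
    )
)

# the wrapper keeps both sub-configs and adds a projection when the hidden sizes differ
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
model.eval()
print(type(model.config.encoder).__name__, type(model.config.decoder).__name__)
```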
user_prompt = task_prompt.replace("{user_input}", question) + prompt_tensors = original_model.decoder.tokenizer(user_prompt, add_special_tokens=False, return_tensors="pt")[ + "input_ids" + ] original_patch_embed = original_model.encoder.model.patch_embed(pixel_values) - - print("Shape of original patch embeddings:", original_patch_embed.shape) - print("Original patch embeddings:", original_patch_embed[0, :3, :3]) - patch_embeddings, _ = model.encoder.embeddings(pixel_values) + assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3) - print("Shape of patch embeddings:", patch_embeddings.shape) - print("HuggingFace patch embeddings:", patch_embeddings[0, :3, :3]) - - last_hidden_state = original_model.encoder(pixel_values) - print("First values of original last_hidden_state:", last_hidden_state[0, :3, :3]) - - outputs = model.encoder(pixel_values, output_hidden_states=True) - print("Shape of last hidden state HuggingFace one:", outputs.last_hidden_states[-1][0, :3, :3]) + # verify encoder hidden states + original_last_hidden_state = original_model.encoder(pixel_values) + last_hidden_state = model.encoder(pixel_values).last_hidden_state + assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2) - # assert torch.allclose(last_hidden_state, outputs.last_hidden_state, atol=1e-3) - # print("Looks ok!") + # verify decoder hidden states + original_logits = original_model(pixel_values, prompt_tensors, None).logits + logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits + assert torch.allclose(original_logits, logits, atol=1e-3) + print("Looks ok!") if pytorch_dump_folder_path is not None: print(f"Saving model and feature extractor to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) # feature_extractor.save_pretrained(pytorch_dump_folder_path) + if push_to_hub: + model.push_to_hub(model_name.split("/")[-1], organization="nielsr") + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -178,6 +182,7 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path): parser.add_argument( "--model_name", default="naver-clova-ix/donut-base-finetuned-docvqa", + required=False, type=str, help="Name of the original model you'd like to convert.", ) @@ -188,6 +193,9 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path): type=str, help="Path to the output PyTorch model directory.", ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) args = parser.parse_args() - convert_swin_checkpoint(args.model_name, args.pytorch_dump_folder_path) + convert_swin_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index bc1ef11b81906..437dabdd7cb61 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -914,7 +914,7 @@ def _set_gradient_checkpointing(self, module, value=False): SWIN_START_DOCSTRING, ) class SwinModel(SwinPreTrainedModel): - def __init__(self, config, add_pooling_layer=True, use_mask_token=False): + def __init__(self, config, add_pooling_layer=True, use_mask_token=False, add_final_layer_norm=True): super().__init__(config) self.config = config self.num_layers = len(config.depths) @@ -923,7 +923,7 @@ def __init__(self, config, add_pooling_layer=True, use_mask_token=False): self.embeddings = SwinEmbeddings(config, use_mask_token=use_mask_token) self.encoder = SwinEncoder(config, self.embeddings.patch_grid) - self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) if add_final_layer_norm else None self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None # Initialize weights and apply final processing @@ -986,7 +986,8 @@ def forward( ) sequence_output = encoder_outputs[0] - sequence_output = self.layernorm(sequence_output) + if self.layernorm is not None: + sequence_output = self.layernorm(sequence_output) pooled_output = None if self.pooler is not None: From b4f4c83dbaa8dc1086b89683b6fb0927183a5a97 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 24 Jul 2022 22:35:58 +0200 Subject: [PATCH 05/35] Add final_layer_norm attribute to Swin's config --- src/transformers/models/donut/convert_donut_to_pytorch.py | 3 ++- src/transformers/models/swin/configuration_swin.py | 4 ++++ src/transformers/models/swin/modeling_swin.py | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index e93e2002bc9b2..7bd44e5e5e3eb 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -34,6 +34,7 @@ def get_configs(model): num_heads=[4, 8, 16, 32], window_size=original_config.window_size, embed_dim=128, + add_final_layer_norm=False, ) decoder_config = MBartConfig( is_decoder=True, @@ -129,7 +130,7 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_h # load HuggingFace model encoder_config, decoder_config = get_configs(original_model) - encoder = SwinModel(encoder_config, add_final_layer_norm=False) + encoder = SwinModel(encoder_config) decoder = MBartForCausalLM(decoder_config) model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) model.eval() diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py index 878a73e9208b5..5b3410ae8ce3f 100644 --- a/src/transformers/models/swin/configuration_swin.py +++ b/src/transformers/models/swin/configuration_swin.py @@ -75,6 +75,8 @@ class SwinConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. 
+ add_final_layer_norm (`bool`, *optional*, defaults to True): + Whether or not to add a final layer normalization layer after the stages. encoder_stride (`int`, `optional`, defaults to 32): Factor to increase the spatial resolution by in the decoder head for masked image modeling. @@ -118,6 +120,7 @@ def __init__( patch_norm=True, initializer_range=0.02, layer_norm_eps=1e-5, + add_final_layer_norm=True, encoder_stride=32, **kwargs ): @@ -141,6 +144,7 @@ def __init__( self.path_norm = patch_norm self.layer_norm_eps = layer_norm_eps self.initializer_range = initializer_range + self.add_final_layer_norm = add_final_layer_norm self.encoder_stride = encoder_stride # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel # this indicates the channel dimension after the last stage of the model diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 437dabdd7cb61..6620e281cd204 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -914,7 +914,7 @@ def _set_gradient_checkpointing(self, module, value=False): SWIN_START_DOCSTRING, ) class SwinModel(SwinPreTrainedModel): - def __init__(self, config, add_pooling_layer=True, use_mask_token=False, add_final_layer_norm=True): + def __init__(self, config, add_pooling_layer=True, use_mask_token=False): super().__init__(config) self.config = config self.num_layers = len(config.depths) @@ -923,7 +923,7 @@ def __init__(self, config, add_pooling_layer=True, use_mask_token=False, add_fin self.embeddings = SwinEmbeddings(config, use_mask_token=use_mask_token) self.encoder = SwinEncoder(config, self.embeddings.patch_grid) - self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) if add_final_layer_norm else None + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) if config.add_final_layer_norm else None self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None # Initialize weights and apply final processing From c2815470f98a888a527c270c73ec15faf88395dc Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 25 Jul 2022 12:38:46 +0200 Subject: [PATCH 06/35] Add DonutProcessor --- src/transformers/__init__.py | 4 + .../models/auto/feature_extraction_auto.py | 1 + .../models/auto/processing_auto.py | 1 + src/transformers/models/donut/__init__.py | 48 +++++ .../models/donut/convert_donut_to_pytorch.py | 31 ++- .../models/donut/feature_extraction_donut.py | 179 ++++++++++++++++++ .../models/donut/processing_donut.py | 75 ++++++++ .../models/donut/test_feature_extractor.py | 14 ++ src/transformers/models/swin/modeling_swin.py | 4 +- 9 files changed, 349 insertions(+), 8 deletions(-) create mode 100644 src/transformers/models/donut/__init__.py create mode 100644 src/transformers/models/donut/feature_extraction_donut.py create mode 100644 src/transformers/models/donut/processing_donut.py create mode 100644 src/transformers/models/donut/test_feature_extractor.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2f53db07f078f..51e640e180fd2 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -190,6 +190,7 @@ "models.dialogpt": [], "models.distilbert": ["DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DistilBertConfig", "DistilBertTokenizer"], "models.dit": [], + "models.donut": ["DonutProcessor"], "models.dpr": [ "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPRConfig", @@ -641,6 +642,7 @@ 
_import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") + _import_structure["models.donut"].append("DonutFeatureExtractor") _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaProcessor"]) _import_structure["models.glpn"].append("GLPNFeatureExtractor") @@ -2984,6 +2986,7 @@ from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer + from .models.donut import DonutProcessor from .models.dpr import ( DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig, @@ -3375,6 +3378,7 @@ from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor + from .models.donut import DonutFeatureExtractor from .models.dpt import DPTFeatureExtractor from .models.flava import FlavaFeatureExtractor, FlavaProcessor from .models.glpn import GLPNFeatureExtractor diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index db581d03d8fb7..5c5f86d040c8f 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -46,6 +46,7 @@ ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), ("detr", "DetrFeatureExtractor"), + ("donut", "DonutFeatureExtractor"), ("dpt", "DPTFeatureExtractor"), ("flava", "FlavaFeatureExtractor"), ("glpn", "GLPNFeatureExtractor"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index aed7b4b976137..c6f4fd98316a4 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -38,6 +38,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( [ ("clip", "CLIPProcessor"), + ("donut", "DonutProcessor"), ("flava", "FlavaProcessor"), ("groupvit", "CLIPProcessor"), ("layoutlmv2", "LayoutLMv2Processor"), diff --git a/src/transformers/models/donut/__init__.py b/src/transformers/models/donut/__init__.py new file mode 100644 index 0000000000000..057dd49fff53f --- /dev/null +++ b/src/transformers/models/donut/__init__.py @@ -0,0 +1,48 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
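With the mappings above in place, the auto classes can resolve Donut checkpoints without importing the Donut classes explicitly. A hypothetical usage sketch — it assumes the converted weights and processor have been pushed to the Hub under the repo id used by the integration tests further down, and it needs network access:

```python
from transformers import AutoFeatureExtractor, AutoProcessor

# resolves to DonutProcessor / DonutFeatureExtractor via the auto mappings registered above
processor = AutoProcessor.from_pretrained("nielsr/donut-base-finetuned-docvqa")
feature_extractor = AutoFeatureExtractor.from_pretrained("nielsr/donut-base-finetuned-docvqa")
print(type(processor).__name__, type(feature_extractor).__name__)
```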
+from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_vision_available + + +_import_structure = {"processing_donut": ["DonutProcessor"]} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_donut"] = ["DonutFeatureExtractor"] + + +if TYPE_CHECKING: + from .processing_donut import DonutProcessor + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_donut import DonutFeatureExtractor + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 7bd44e5e5e3eb..f71fcd6be3ae4 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert DONUT checkpoints using the original `donut-python` library. URL: https://github.com/clovaai/donut""" +"""Convert Donut checkpoints using the original `donut-python` library. URL: https://github.com/clovaai/donut""" import argparse @@ -21,7 +21,16 @@ from PIL import Image from donut import DonutModel -from transformers import MBartConfig, MBartForCausalLM, SwinConfig, SwinModel, VisionEncoderDecoderModel +from transformers import ( + DonutFeatureExtractor, + DonutProcessor, + MBartConfig, + MBartForCausalLM, + SwinConfig, + SwinModel, + VisionEncoderDecoderModel, + XLMRobertaTokenizer, +) def get_configs(model): @@ -143,8 +152,13 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_h dataset = load_dataset("hf-internal-testing/fixtures_docvqa") image = Image.open(dataset["test"][0]["file"]).convert("RGB") - # TODO create DonutProcessor (which combines a DonutFeatureExtractor and XLMRobertaTokenizer) - pixel_values = original_model.encoder.prepare_input(image).unsqueeze(0) + # TODO maybe verify pixel values against original implementation + # original_pixel_values = original_model.encoder.prepare_input(image).unsqueeze(0) + + tokenizer = XLMRobertaTokenizer.from_pretrained(model_name) + feature_extractor = DonutFeatureExtractor() + processor = DonutProcessor(feature_extractor, tokenizer) + pixel_values = processor(image, return_tensors="pt").pixel_values task_prompt = "{user_input}" question = "When is the coffee break?" 
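For a quick, offline check of the preprocessing that produces `pixel_values` above, a blank PIL image can stand in for a document scan. The `size` value below is an illustrative (width, height) pair, not the Donut default, and the snippet assumes a `transformers` build that already ships the `DonutFeatureExtractor` added in this series:

```python
from PIL import Image

from transformers import DonutFeatureExtractor

image = Image.new("RGB", (600, 800), color="white")         # (width, height) of the dummy page
feature_extractor = DonutFeatureExtractor(size=[720, 960])   # (width, height) target, illustrative only
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values

# channels-first tensor, resized/padded to the target size and normalized
print(pixel_values.shape)
```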
@@ -169,12 +183,13 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_h print("Looks ok!") if pytorch_dump_folder_path is not None: - print(f"Saving model and feature extractor to {pytorch_dump_folder_path}") + print(f"Saving model and processor to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - # feature_extractor.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: model.push_to_hub(model_name.split("/")[-1], organization="nielsr") + processor.push_to_hub(model_name.split("/")[-1], organization="nielsr") if __name__ == "__main__": @@ -195,7 +210,9 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_h help="Path to the output PyTorch model directory.", ) parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + "--push_to_hub", + action="store_true", + help="Whether or not to push the converted model and processor to the 🤗 hub.", ) args = parser.parse_args() diff --git a/src/transformers/models/donut/feature_extraction_donut.py b/src/transformers/models/donut/feature_extraction_donut.py new file mode 100644 index 0000000000000..a139a4037a183 --- /dev/null +++ b/src/transformers/models/donut/feature_extraction_donut.py @@ -0,0 +1,179 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for Donut.""" + +from typing import Optional, Tuple, Union + +import numpy as np +from PIL import Image, ImageOps + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ImageFeatureExtractionMixin, + ImageInput, + is_torch_tensor, +) +from ...utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +class DonutFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a Donut feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + Args: + do_resize_and_thumbnail (`bool`, *optional*, defaults to `True`): + Whether to resize the shorter edge of the input to the minimum value of a certain `size`, and thumbnail the + input to the given `size`. + size (`Tuple(int)`, *optional*, defaults to [1920, 2560]): + Resize the shorter edge of the input to the minimum value of the given size. Should be a tuple of (width, + height). Only has an effect if `do_resize` is set to `True`. + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, + `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect + if `do_resize` is set to `True`. 
+ do_pad (`bool`, *optional*, defaults to `True`): + Whether or not to pad the input to `size`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + The sequence of means for each channel, to be used when normalizing images. + image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + The sequence of standard deviations for each channel, to be used when normalizing images. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize_and_thumbnail=True, + size=[1920, 2560], + resample=Image.BILINEAR, + do_pad=True, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.do_resize_and_thumbnail = do_resize_and_thumbnail + self.size = size + self.resample = resample + self.do_pad = do_pad + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize_and_thumbnail(self, image, size, resample): + # resize the shorter edge of the image to `min(size)` + image = self.resize(image, size=min(size), resample=resample, default_to_square=False) + # create a thumbnail + image.thumbnail((size[0], size[1])) + + return image + + def pad(self, image: Image.Image, size: Tuple[int, int]) -> Image.Image: + delta_width = size[0] - image.width + delta_height = size[1] - image.height + + pad_width = delta_width // 2 + pad_height = delta_height // 2 + padding = ( + pad_width, + pad_height, + delta_width - pad_width, + delta_height - pad_height, + ) + return ImageOps.expand(image, padding) + + def __call__( + self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s). + + + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, + width). 
+ """ + # Input type checking for clearer error + valid_images = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." + ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + if not is_batched: + images = [images] + + # transformations (resizing + padding + normalization) + if self.do_resize_and_thumbnail and self.size is not None: + images = [ + self.resize_and_thumbnail(image=image, size=self.size, resample=self.resample) for image in images + ] + if self.do_pad and self.size is not None: + images = [self.pad(image=image, size=self.size) for image in images] + if self.do_normalize: + images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] + + # return as BatchFeature + data = {"pixel_values": images} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py new file mode 100644 index 0000000000000..d56972872437b --- /dev/null +++ b/src/transformers/models/donut/processing_donut.py @@ -0,0 +1,75 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Donut. +""" +from contextlib import contextmanager + +from ...processing_utils import ProcessorMixin + + +class DonutProcessor(ProcessorMixin): + r""" + Constructs a Donut processor which wraps a Donut feature extractor and an XLMRoBERTa tokenizer into a single + processor. + + [`DonutProcessor`] offers all the functionalities of [`DonutFeatureExtractor`] and + [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and + [`~DonutProcessor.decode`] for more information. + + Args: + feature_extractor ([`DonutFeatureExtractor`]): + An instance of [`DonutFeatureExtractor`]. The feature extractor is a required input. + tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]): + An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input. 
+ """ + feature_extractor_class = "AutoFeatureExtractor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + self.current_processor = self.feature_extractor + + def __call__(self, *args, **kwargs): + """ + When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's + [`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context + [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's + [`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. + """ + return self.current_processor(*args, **kwargs) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the + docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @contextmanager + def as_target_processor(self): + """ + Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning Donut. + """ + self.current_processor = self.tokenizer + yield + self.current_processor = self.feature_extractor diff --git a/src/transformers/models/donut/test_feature_extractor.py b/src/transformers/models/donut/test_feature_extractor.py new file mode 100644 index 0000000000000..b8b1d8cebb24e --- /dev/null +++ b/src/transformers/models/donut/test_feature_extractor.py @@ -0,0 +1,14 @@ +from datasets import load_dataset +from PIL import Image + +from transformers import DonutFeatureExtractor + + +dataset = load_dataset("hf-internal-testing/fixtures_docvqa") +image = Image.open(dataset["test"][0]["file"]).convert("RGB") + +feature_extractor = DonutFeatureExtractor() + +encoding = feature_extractor(image, return_tensors="pt") + +print(encoding.pixel_values.shape) diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 6620e281cd204..662cdddb8b0fe 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -923,7 +923,9 @@ def __init__(self, config, add_pooling_layer=True, use_mask_token=False): self.embeddings = SwinEmbeddings(config, use_mask_token=use_mask_token) self.encoder = SwinEncoder(config, self.embeddings.patch_grid) - self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) if config.add_final_layer_norm else None + self.layernorm = ( + nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) if config.add_final_layer_norm else None + ) self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None # Initialize weights and apply final processing From 0a1c659ae896e5a3e983e498232172910b833af7 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 25 Jul 2022 16:21:30 +0200 Subject: [PATCH 07/35] Convert more models --- .../models/donut/convert_donut_to_pytorch.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index f71fcd6be3ae4..7d0c6d587beb3 100644 --- 
a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -29,7 +29,7 @@ SwinConfig, SwinModel, VisionEncoderDecoderModel, - XLMRobertaTokenizer, + XLMRobertaTokenizerFast, ) @@ -155,15 +155,20 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_h # TODO maybe verify pixel values against original implementation # original_pixel_values = original_model.encoder.prepare_input(image).unsqueeze(0) - tokenizer = XLMRobertaTokenizer.from_pretrained(model_name) - feature_extractor = DonutFeatureExtractor() + tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True) + feature_extractor = DonutFeatureExtractor(size=original_model.config.input_size[::-1]) processor = DonutProcessor(feature_extractor, tokenizer) pixel_values = processor(image, return_tensors="pt").pixel_values - task_prompt = "{user_input}" - question = "When is the coffee break?" - user_prompt = task_prompt.replace("{user_input}", question) - prompt_tensors = original_model.decoder.tokenizer(user_prompt, add_special_tokens=False, return_tensors="pt")[ + if "vqa" in model_name: + task_prompt = "{user_input}" + question = "When is the coffee break?" + task_prompt = task_prompt.replace("{user_input}", question) + elif "rvlcdip" in model_name: + task_prompt = "" + elif "cord-v2" in model_name: + task_prompt = "s_cord-v2>" + prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")[ "input_ids" ] From 910b6f8cb9932110a593588a73b30b033aa0b71d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 26 Jul 2022 11:42:43 +0200 Subject: [PATCH 08/35] Improve feature extractor and convert base models --- .../models/donut/convert_donut_to_pytorch.py | 19 ++++--- .../models/donut/feature_extraction_donut.py | 54 ++++++++++++++----- .../models/donut/test_feature_extractor.py | 2 +- 3 files changed, 55 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 7d0c6d587beb3..57d893a6d5dd1 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -133,7 +133,7 @@ def convert_state_dict(orig_state_dict, model): return orig_state_dict -def convert_swin_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): +def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): # load original model original_model = DonutModel.from_pretrained(model_name).eval() @@ -156,18 +156,25 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_h # original_pixel_values = original_model.encoder.prepare_input(image).unsqueeze(0) tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True) - feature_extractor = DonutFeatureExtractor(size=original_model.config.input_size[::-1]) + feature_extractor = DonutFeatureExtractor( + do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1] + ) processor = DonutProcessor(feature_extractor, tokenizer) pixel_values = processor(image, return_tensors="pt").pixel_values - if "vqa" in model_name: + if model_name == "naver-clova-ix/donut-base-finetuned-docvqa": task_prompt = "{user_input}" question = "When is the coffee break?" 
task_prompt = task_prompt.replace("{user_input}", question) - elif "rvlcdip" in model_name: + elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip": task_prompt = "" - elif "cord-v2" in model_name: + elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2": task_prompt = "s_cord-v2>" + elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]: + # use a random prompt + task_prompt = "hello world" + else: + raise ValueError("Model name not supported") prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")[ "input_ids" ] @@ -221,4 +228,4 @@ def convert_swin_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_h ) args = parser.parse_args() - convert_swin_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) + convert_donut_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/donut/feature_extraction_donut.py b/src/transformers/models/donut/feature_extraction_donut.py index a139a4037a183..54e2fc2c657f3 100644 --- a/src/transformers/models/donut/feature_extraction_donut.py +++ b/src/transformers/models/donut/feature_extraction_donut.py @@ -14,7 +14,7 @@ # limitations under the License. """Feature extractor class for Donut.""" -from typing import Optional, Tuple, Union +from typing import Bool, Optional, Tuple, Union import numpy as np from PIL import Image, ImageOps @@ -41,7 +41,7 @@ class DonutFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin) should refer to this superclass for more information regarding those methods. Args: - do_resize_and_thumbnail (`bool`, *optional*, defaults to `True`): + do_resize (`bool`, *optional*, defaults to `True`): Whether to resize the shorter edge of the input to the minimum value of a certain `size`, and thumbnail the input to the given `size`. size (`Tuple(int)`, *optional*, defaults to [1920, 2560]): @@ -51,6 +51,8 @@ class DonutFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin) An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect if `do_resize` is set to `True`. + do_align_long_axis (`bool`, *optional*, defaults to `False`): + Whether to rotate the input if the height is greater than width. do_pad (`bool`, *optional*, defaults to `True`): Whether or not to pad the input to `size`. do_normalize (`bool`, *optional*, defaults to `True`): @@ -59,15 +61,17 @@ class DonutFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin) The sequence of means for each channel, to be used when normalizing images. image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): The sequence of standard deviations for each channel, to be used when normalizing images. 
+ """ model_input_names = ["pixel_values"] def __init__( self, - do_resize_and_thumbnail=True, + do_resize=True, size=[1920, 2560], resample=Image.BILINEAR, + do_align_long_axis=False, do_pad=True, do_normalize=True, image_mean=None, @@ -75,28 +79,43 @@ def __init__( **kwargs ): super().__init__(**kwargs) - self.do_resize_and_thumbnail = do_resize_and_thumbnail + self.do_resize = do_resize self.size = size self.resample = resample + self.do_align_long_axis = do_align_long_axis self.do_pad = do_pad self.do_normalize = do_normalize self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + def rotate(self, image, size): + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + + if (size[1] > size[0] and image.width > image.height) or (size[1] < size[0] and image.width < image.height): + image = image.rotate(angle=-90, expand=True) + + return image + def resize_and_thumbnail(self, image, size, resample): - # resize the shorter edge of the image to `min(size)` + # 1. resize the shorter edge of the image to `min(size)` image = self.resize(image, size=min(size), resample=resample, default_to_square=False) - # create a thumbnail + # 2. create a thumbnail image.thumbnail((size[0], size[1])) return image - def pad(self, image: Image.Image, size: Tuple[int, int]) -> Image.Image: + def pad(self, image: Image.Image, size: Tuple[int, int], random_padding: Bool = False) -> Image.Image: delta_width = size[0] - image.width delta_height = size[1] - image.height - pad_width = delta_width // 2 - pad_height = delta_height // 2 + if random_padding: + pad_width = np.random.randint(low=0, high=delta_width + 1) + pad_height = np.random.randint(low=0, high=delta_height + 1) + else: + pad_width = delta_width // 2 + pad_height = delta_height // 2 + padding = ( pad_width, pad_height, @@ -106,7 +125,11 @@ def pad(self, image: Image.Image, size: Tuple[int, int]) -> Image.Image: return ImageOps.expand(image, padding) def __call__( - self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = None, + random_padding=False, + **kwargs ) -> BatchFeature: """ Main method to prepare for the model one or several image(s). @@ -124,6 +147,9 @@ def __call__( tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + random_padding (`bool`, *optional*, defaults to `False`): + Whether to randomly pad the input to `size`. + return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`): If set, will return tensors of a particular framework. 
Acceptable values are: @@ -162,13 +188,15 @@ def __call__( if not is_batched: images = [images] - # transformations (resizing + padding + normalization) - if self.do_resize_and_thumbnail and self.size is not None: + # transformations (rotating + resizing + padding + normalization) + if self.do_align_long_axis: + images = [self.rotate(image, self.size) for image in images] + if self.do_resize and self.size is not None: images = [ self.resize_and_thumbnail(image=image, size=self.size, resample=self.resample) for image in images ] if self.do_pad and self.size is not None: - images = [self.pad(image=image, size=self.size) for image in images] + images = [self.pad(image=image, size=self.size, random_padding=random_padding) for image in images] if self.do_normalize: images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] diff --git a/src/transformers/models/donut/test_feature_extractor.py b/src/transformers/models/donut/test_feature_extractor.py index b8b1d8cebb24e..b6eb4ce546a44 100644 --- a/src/transformers/models/donut/test_feature_extractor.py +++ b/src/transformers/models/donut/test_feature_extractor.py @@ -7,7 +7,7 @@ dataset = load_dataset("hf-internal-testing/fixtures_docvqa") image = Image.open(dataset["test"][0]["file"]).convert("RGB") -feature_extractor = DonutFeatureExtractor() +feature_extractor = DonutFeatureExtractor(do_align_long_axis=True) encoding = feature_extractor(image, return_tensors="pt") From 809d0a212026e8d2d8e4fd17405cc2e760659189 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 26 Jul 2022 11:44:56 +0200 Subject: [PATCH 09/35] Fix bug --- src/transformers/models/donut/feature_extraction_donut.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/donut/feature_extraction_donut.py b/src/transformers/models/donut/feature_extraction_donut.py index 54e2fc2c657f3..6bcd0c2f25b62 100644 --- a/src/transformers/models/donut/feature_extraction_donut.py +++ b/src/transformers/models/donut/feature_extraction_donut.py @@ -14,7 +14,7 @@ # limitations under the License. """Feature extractor class for Donut.""" -from typing import Bool, Optional, Tuple, Union +from typing import Optional, Tuple, Union import numpy as np from PIL import Image, ImageOps @@ -105,7 +105,7 @@ def resize_and_thumbnail(self, image, size, resample): return image - def pad(self, image: Image.Image, size: Tuple[int, int], random_padding: Bool = False) -> Image.Image: + def pad(self, image: Image.Image, size: Tuple[int, int], random_padding: bool = False) -> Image.Image: delta_width = size[0] - image.width delta_height = size[1] - image.height From 5ef32738e79159348829f3ebc0f3d46b108d7770 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 28 Jul 2022 16:09:38 +0200 Subject: [PATCH 10/35] Improve integration tests --- .../test_modeling_vision_encoder_decoder.py | 142 +++++++++++++++++- 1 file changed, 135 insertions(+), 7 deletions(-) diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index 320cdd6330626..dfa5d2a11bf1f 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -13,14 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
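The `do_align_long_axis` behaviour added above can be checked in isolation with a toy PIL image; the condition below mirrors the one in `rotate`, with illustrative sizes:

```python
from PIL import Image

size = [1920, 2560]                   # target (width, height): portrait
image = Image.new("RGB", (300, 200))  # landscape dummy input (width > height)

# same orientation check as in `rotate`: only flip when image and target orientation disagree
if (size[1] > size[0] and image.width > image.height) or (size[1] < size[0] and image.width < image.height):
    image = image.rotate(angle=-90, expand=True)

print(image.size)  # (200, 300) -- the long axis now matches the target's long axis
```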
- +import re import tempfile import unittest from datasets import load_dataset from packaging import version -from transformers.testing_utils import require_torch, require_vision, slow, to_2tuple, torch_device +from transformers import DonutProcessor, TrOCRProcessor +from transformers.testing_utils import ( + require_sentencepiece, + require_torch, + require_vision, + slow, + to_2tuple, + torch_device, +) from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask @@ -54,7 +62,7 @@ import PIL from PIL import Image - from transformers import TrOCRProcessor, ViTFeatureExtractor + from transformers import ViTFeatureExtractor @require_torch @@ -654,8 +662,8 @@ def default_processor(self): def test_inference_handwritten(self): model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(torch_device) - ds = load_dataset("hf-internal-testing/fixtures_ocr", split="test") - image = Image.open(ds[0]["file"]).convert("RGB") + dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test") + image = Image.open(dataset[0]["file"]).convert("RGB") processor = self.default_processor pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) @@ -679,8 +687,8 @@ def test_inference_handwritten(self): def test_inference_printed(self): model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(torch_device) - ds = load_dataset("hf-internal-testing/fixtures_ocr", split="test") - image = Image.open(ds[1]["file"]).convert("RGB") + dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test") + image = Image.open(dataset[1]["file"]).convert("RGB") processor = self.default_processor pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) @@ -774,3 +782,123 @@ def generate_step(pixel_values): # should produce # ["a cat laying on top of a couch next to another cat"] self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"]) + + +@require_vision +@require_torch +@require_sentencepiece +class DonutModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_docvqa(self): + processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-docvqa") + model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-docvqa").to(torch_device) + + dataset = load_dataset("hf-internal-testing/example-documents", split="test") + image = dataset[0]["image"] + + pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) + decoder_input_ids = processor.tokenizer( + "", add_special_tokens=False, return_tensors="pt" + ).input_ids.to(torch_device) + + # step 1: single forward pass + with torch.no_grad(): + outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) + logits = outputs.logits + + # verify the logits + expected_shape = torch.Size([1, 3, 57532]) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([24.2731, -6.4522, 32.4130]).to(torch_device) + + self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) + + # step 2: generation + task_prompt = "{user_input}" + question = "When is the coffee break?" 
+ prompt = task_prompt.replace("{user_input}", question) + decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids.to( + torch_device + ) + + outputs = model.generate( + pixel_values, + decoder_input_ids=decoder_input_ids, + max_length=model.decoder.config.max_position_embeddings, + early_stopping=True, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + num_beams=1, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + output_scores=True, + ) + sequence = processor.batch_decode(outputs.sequences)[0] + sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") + sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token + + # verify generated sequence + self.assertEqual( + sequence, [" When is the coffee break? 11-14 to 11:39 a.m."] + ) + + # verify scores + self.assertEqual(len(outputs.scores), 11) + self.assertTrue( + torch.allclose( + outputs.scores[0][0, :3], torch.tensor([5.3153, -3.5276, 13.4781], device=torch_device), atol=1e-4 + ) + ) + + @slow + def test_inference_cordv2(self): + processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-cord-v2") + model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-cord-v2").to(torch_device) + + dataset = load_dataset("hf-internal-testing/example-documents", split="test") + image = dataset[1]["image"] + + pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) + decoder_input_ids = processor.tokenizer( + "", add_special_tokens=False, return_tensors="pt" + ).input_ids.to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) + logits = outputs.logits + + # verify the logits + expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-24.4937, -2.8204, -15.3401], device=torch_device) + + self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_rvlcdip(self): + processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-rvlcdip") + model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-rvlcdip").to(torch_device) + + dataset = load_dataset("hf-internal-testing/example-documents", split="test") + image = dataset[1]["image"] + + pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) + + # forward pass + decoder_input_ids = processor.tokenizer( + "", add_special_tokens=False, return_tensors="pt" + ).input_ids.to(torch_device) + with torch.no_grad(): + outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) + logits = outputs.logits + + # verify the logits + expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-17.6490, -4.8381, -15.7577], device=torch_device) + + self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) From 1be072cbb69dd84cb039086d7396c2161830572a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 29 Jul 2022 12:25:37 +0200 Subject: [PATCH 11/35] Improve integration tests and add model to README --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 1 + 
docs/source/en/model_doc/donut.mdx | 79 ++++++++++++++++++ .../utils/dummy_vision_objects.py | 7 ++ .../test_modeling_vision_encoder_decoder.py | 81 +++++++++++++++++-- 8 files changed, 164 insertions(+), 8 deletions(-) create mode 100644 docs/source/en/model_doc/donut.mdx diff --git a/README.md b/README.md index 46a4b07c14cd3..30bc6d870bbf0 100644 --- a/README.md +++ b/README.md @@ -286,6 +286,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](https://huggingface.co/docs/transformers/main/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. diff --git a/README_ko.md b/README_ko.md index c63fdca749da8..cc0b790ad76a8 100644 --- a/README_ko.md +++ b/README_ko.md @@ -242,6 +242,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. 
**[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](https://huggingface.co/docs/transformers/main/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. diff --git a/README_zh-hans.md b/README_zh-hans.md index 0ab06bd96ad99..fe2fa45f71f39 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -266,6 +266,7 @@ conda install -c huggingface transformers 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。 1. 
**[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) 和德语版 DistilBERT。 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (来自 Microsoft Research) 伴随论文 [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 由 Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei 发布。 +1. **[Donut](https://huggingface.co/docs/transformers/main/model_doc/donut)** (来自 NAVER) 伴随论文 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 由 Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 发布。 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (来自 Intel Labs) 伴随论文 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 由 René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 发布。 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 90f29ad031b8b..4f5a995476149 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -278,6 +278,7 @@ conda install -c huggingface transformers 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT. 1. 
**[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](https://huggingface.co/docs/transformers/main/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 5c0d51d8b7afb..2f5ffb67eab53 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -84,6 +84,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. 
**[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. diff --git a/docs/source/en/model_doc/donut.mdx b/docs/source/en/model_doc/donut.mdx new file mode 100644 index 0000000000000..ba4a7a249b5ce --- /dev/null +++ b/docs/source/en/model_doc/donut.mdx @@ -0,0 +1,79 @@ + + +# Donut + +## Overview + +The Donut model was proposed in [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by +Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. +Donut consists of an image Transformer encoder and an autoregressive text Transformer decoder to perform document understanding +tasks such as document image classification, form understanding and visual question answering. + +The abstract from the paper is the following: + +*Understanding document images (e.g., invoices) is a core but challenging task since it requires complex functions such as reading text and a holistic understanding of the document. Current Visual Document Understanding (VDU) methods outsource the task of reading text to off-the-shelf Optical Character Recognition (OCR) engines and focus on the understanding task with the OCR outputs. Although such OCR-based approaches have shown promising performance, they suffer from 1) high computational costs for using OCR; 2) inflexibility of OCR models on languages or types of document; 3) OCR error propagation to the subsequent process. To address these issues, in this paper, we introduce a novel OCR-free VDU model named Donut, which stands for Document understanding transformer. As the first step in OCR-free VDU research, we propose a simple architecture (i.e., Transformer) with a pre-training objective (i.e., cross-entropy loss). Donut is conceptually simple yet effective. Through extensive experiments and analyses, we show a simple OCR-free VDU model, Donut, achieves state-of-the-art performances on various VDU tasks in terms of both speed and accuracy. In addition, we offer a synthetic data generator that helps the model pre-training to be flexible in various languages and domains.* + + + + Donut high-level overview. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found +[here](https://github.com/clovaai/donut). + +Tips: + +- The quickest way to get started with Donut is by checking the [tutorial + notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/donut), which show how to use the model + at inference time as well as fine-tuning on custom data. +- Donut is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework. 
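As a small illustration of the last tip above: a Donut checkpoint loads as a plain `VisionEncoderDecoderModel`, so its image encoder and text decoder are separate submodules that can be inspected (or swapped) independently. The checkpoint name below follows the tests in this patch and may differ from the final location on the Hub.

```py
# Hedged illustration: Donut lives inside the VisionEncoderDecoder framework,
# so encoder and decoder are regular submodules of the loaded model.
from transformers import VisionEncoderDecoderModel

model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-docvqa")
print(type(model.encoder).__name__)  # the Swin-style image encoder
print(type(model.decoder).__name__)  # the autoregressive text decoder
```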
+ +## Inference + +Donut's [`VisionEncoderDecoder`] model accepts images as input and makes use of +[`~generation_utils.GenerationMixin.generate`] to autoregressively generate text given the input image. + +The [`DonutFeatureExtractor`] class is responsible for preprocessing the input image and +[`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`] decodes the generated target tokens to the target string. The +[`DonutProcessor`] wraps [`DonutFeatureExtractor`] and [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`] +into a single instance to both extract the input features and decode the predicted token ids. + +- Step-by-step Document Visual Question Answering (DocVQA) + +``` py +>>> from transformers import DonutProcessor, VisionEncoderDecoderModel +>>> from datasets import load_dataset + +>>> processor = DonutProcessor.from_pretrained("microsoft/trocr-base-handwritten") +>>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") + +>>> # load document image from the DocVQA dataset +>>> dataset = load_dataset("hf-internal-testing/example-documents", split="test") +>>> image = dataset[0]["image"] + +>>> pixel_values = processor(image, return_tensors="pt").pixel_values +>>> generated_ids = model.generate(pixel_values) + +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +``` + +See the [model hub](https://huggingface.co/models?filter=donut) to look for Donut checkpoints. + +## DonutProcessor + +[[autodoc]] DonutProcessor + - __call__ + - from_pretrained + - save_pretrained + - batch_decode + - decode + - as_target_processor diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 30228e022222b..fa30432070a37 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -52,6 +52,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class DonutFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class DPTFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index dfa5d2a11bf1f..a21c6dc967e12 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -811,16 +811,14 @@ def test_inference_docvqa(self): self.assertEqual(outputs.logits.shape, expected_shape) expected_slice = torch.tensor([24.2731, -6.4522, 32.4130]).to(torch_device) - self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) # step 2: generation task_prompt = "{user_input}" question = "When is the coffee break?" 
prompt = task_prompt.replace("{user_input}", question) - decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids.to( - torch_device - ) + decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids + decoder_input_ids = decoder_input_ids.to(torch_device) outputs = model.generate( pixel_values, @@ -864,7 +862,7 @@ def test_inference_cordv2(self): "", add_special_tokens=False, return_tensors="pt" ).input_ids.to(torch_device) - # forward pass + # step 1: single forward pass with torch.no_grad(): outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) logits = outputs.logits @@ -874,9 +872,44 @@ def test_inference_cordv2(self): self.assertEqual(outputs.logits.shape, expected_shape) expected_slice = torch.tensor([-24.4937, -2.8204, -15.3401], device=torch_device) - self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) + # step 2: generation + task_prompt = "" + decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids + decoder_input_ids = decoder_input_ids.to(torch_device) + + outputs = model.generate( + pixel_values, + decoder_input_ids=decoder_input_ids, + max_length=model.decoder.config.max_position_embeddings, + early_stopping=True, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + num_beams=1, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + output_scores=True, + ) + + sequence = processor.batch_decode(outputs.sequences)[0] + sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") + sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token + + # verify generated sequence + # fmt: off + expected_sequence = " CINNAMON SUGAR 17,000 1 x 17,000 17,000 17,000 20,000 3,000" # noqa: E231 + # fmt: on + self.assertEqual(sequence, expected_sequence) + + # verify scores + self.assertEqual(len(outputs.scores), 43) + self.assertTrue( + torch.allclose( + outputs.scores[0][0, :3], torch.tensor([-27.4344, -3.2686, -19.3524], device=torch_device), atol=1e-4 + ) + ) + @slow def test_inference_rvlcdip(self): processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-rvlcdip") @@ -887,7 +920,7 @@ def test_inference_rvlcdip(self): pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) - # forward pass + # step 1: single forward pass decoder_input_ids = processor.tokenizer( "", add_special_tokens=False, return_tensors="pt" ).input_ids.to(torch_device) @@ -900,5 +933,37 @@ def test_inference_rvlcdip(self): self.assertEqual(outputs.logits.shape, expected_shape) expected_slice = torch.tensor([-17.6490, -4.8381, -15.7577], device=torch_device) - self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) + + # step 2: generation + task_prompt = "" + decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids + decoder_input_ids = decoder_input_ids.to(torch_device) + + outputs = model.generate( + pixel_values, + decoder_input_ids=decoder_input_ids, + max_length=model.decoder.config.max_position_embeddings, + early_stopping=True, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + num_beams=1, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + output_scores=True, + ) + + sequence = 
processor.batch_decode(outputs.sequences)[0] + sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") + sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token + + # verify generated sequence + self.assertEqual(sequence, [""]) + + # verify scores + self.assertEqual(len(outputs.scores), 4) + self.assertTrue( + torch.allclose( + outputs.scores[0][0, :3], torch.tensor([-17.6490, -4.8381, -15.7577], device=torch_device), atol=1e-4 + ) + ) From df405b2fa5a2e52684825a95fcaa725af9937f60 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 29 Jul 2022 12:51:03 +0200 Subject: [PATCH 12/35] Add doc test --- docs/source/en/model_doc/donut.mdx | 34 ++++++++++++++++++++++++++---- utils/documentation_tests.txt | 1 + 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/docs/source/en/model_doc/donut.mdx b/docs/source/en/model_doc/donut.mdx index ba4a7a249b5ce..a8ab946d9f7a5 100644 --- a/docs/source/en/model_doc/donut.mdx +++ b/docs/source/en/model_doc/donut.mdx @@ -50,20 +50,46 @@ into a single instance to both extract the input features and decode the predict - Step-by-step Document Visual Question Answering (DocVQA) ``` py +>>> import re + >>> from transformers import DonutProcessor, VisionEncoderDecoderModel >>> from datasets import load_dataset +>>> import torch + +>>> processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-docvqa") +>>> model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-docvqa") ->>> processor = DonutProcessor.from_pretrained("microsoft/trocr-base-handwritten") ->>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") +>>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> model.to(device) >>> # load document image from the DocVQA dataset >>> dataset = load_dataset("hf-internal-testing/example-documents", split="test") >>> image = dataset[0]["image"] +>>> # prepare decoder inputs +>>> task_prompt = "{user_input}" +>>> question = "When is the coffee break?" +>>> prompt = task_prompt.replace("{user_input}", question) +>>> decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids + >>> pixel_values = processor(image, return_tensors="pt").pixel_values ->>> generated_ids = model.generate(pixel_values) ->>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +>>> outputs = model.generate(pixel_values.to(device), +... decoder_input_ids=decoder_input_ids.to(device), +... max_length=model.decoder.config.max_position_embeddings, +... early_stopping=True, +... pad_token_id=processor.tokenizer.pad_token_id, +... eos_token_id=processor.tokenizer.eos_token_id, +... use_cache=True, +... num_beams=1, +... bad_words_ids=[[processor.tokenizer.unk_token_id]], +... return_dict_in_generate=True) + +>>> sequence = processor.batch_decode(outputs.sequences)[0] +>>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") +>>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token +>>> print(sequence) + When is the coffee break? 11-14 to 11:39 a.m. ``` See the [model hub](https://huggingface.co/models?filter=donut) to look for Donut checkpoints. 
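The same prompt template and generation settings extend to several questions about one document. The sketch below reuses the variables from the example above; the second question is made up purely for illustration.

```py
# Minimal sketch: loop over questions, reusing processor, model, pixel_values, task_prompt
# and device from the example above. The second question is an illustrative placeholder.
questions = ["When is the coffee break?", "What is the first session about?"]
for question in questions:
    prompt = task_prompt.replace("{user_input}", question)
    decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    print(re.sub(r"<.*?>", "", sequence, count=1).strip())  # drop the first task start token
```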
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 1941a7343a6bc..0edda8ae5a4c3 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -8,6 +8,7 @@ docs/source/en/model_doc/t5.mdx docs/source/en/model_doc/t5v1.1.mdx docs/source/en/model_doc/byt5.mdx docs/source/en/model_doc/tapex.mdx +docs/source/en/model_doc/donut.mdx docs/source/en/model_doc/encoder-decoder.mdx src/transformers/generation_utils.py src/transformers/models/albert/modeling_albert.py From e890d41093a704365c45efa7ba4b4f0d4e232676 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 29 Jul 2022 12:57:14 +0200 Subject: [PATCH 13/35] Add feature extractor to docs --- docs/source/en/model_doc/donut.mdx | 6 ++++++ .../models/donut/test_feature_extractor.py | 14 -------------- 2 files changed, 6 insertions(+), 14 deletions(-) delete mode 100644 src/transformers/models/donut/test_feature_extractor.py diff --git a/docs/source/en/model_doc/donut.mdx b/docs/source/en/model_doc/donut.mdx index a8ab946d9f7a5..8af288873bddd 100644 --- a/docs/source/en/model_doc/donut.mdx +++ b/docs/source/en/model_doc/donut.mdx @@ -94,6 +94,12 @@ into a single instance to both extract the input features and decode the predict See the [model hub](https://huggingface.co/models?filter=donut) to look for Donut checkpoints. + +## DonutFeatureExtractor + +[[autodoc]] DonutFeatureExtractor + - __call__ + ## DonutProcessor [[autodoc]] DonutProcessor diff --git a/src/transformers/models/donut/test_feature_extractor.py b/src/transformers/models/donut/test_feature_extractor.py deleted file mode 100644 index b6eb4ce546a44..0000000000000 --- a/src/transformers/models/donut/test_feature_extractor.py +++ /dev/null @@ -1,14 +0,0 @@ -from datasets import load_dataset -from PIL import Image - -from transformers import DonutFeatureExtractor - - -dataset = load_dataset("hf-internal-testing/fixtures_docvqa") -image = Image.open(dataset["test"][0]["file"]).convert("RGB") - -feature_extractor = DonutFeatureExtractor(do_align_long_axis=True) - -encoding = feature_extractor(image, return_tensors="pt") - -print(encoding.pixel_values.shape) From 3c88b6954cac1dd0bf928408df4f385d101de9ec Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 4 Aug 2022 15:06:12 +0200 Subject: [PATCH 14/35] Fix integration tests --- src/transformers/models/__init__.py | 1 + src/transformers/models/auto/configuration_auto.py | 1 + src/transformers/models/swinv2/modeling_swinv2.py | 7 +++++-- .../test_modeling_vision_encoder_decoder.py | 13 ++++++++----- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 11887db91f839..fdf315b2257d8 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -52,6 +52,7 @@ dialogpt, distilbert, dit, + donut, dpr, dpt, electra, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c65a2762a0002..0fcc123a31586 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -304,6 +304,7 @@ ("dialogpt", "DialoGPT"), ("distilbert", "DistilBERT"), ("dit", "DiT"), + ("donut", "Donut"), ("dpr", "DPR"), ("dpt", "DPT"), ("electra", "ELECTRA"), diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 52f836d5b91d3..26cb4e2398dd9 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ 
b/src/transformers/models/swinv2/modeling_swinv2.py @@ -1002,7 +1002,9 @@ def __init__(self, config, add_pooling_layer=True, use_mask_token=False): self.embeddings = Swinv2Embeddings(config, use_mask_token=use_mask_token) self.encoder = Swinv2Encoder(config, self.embeddings.patch_grid) - self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) + self.layernorm = ( + nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) if config.add_final_layer_norm else None + ) self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None # Initialize weights and apply final processing @@ -1065,7 +1067,8 @@ def forward( ) sequence_output = encoder_outputs[0] - sequence_output = self.layernorm(sequence_output) + if self.layernorm is not None: + sequence_output = self.layernorm(sequence_output) pooled_output = None if self.pooler is not None: diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index a21c6dc967e12..e94c3c2b44df0 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -807,7 +807,7 @@ def test_inference_docvqa(self): logits = outputs.logits # verify the logits - expected_shape = torch.Size([1, 3, 57532]) + expected_shape = torch.Size([1, 1, 57532]) self.assertEqual(outputs.logits.shape, expected_shape) expected_slice = torch.tensor([24.2731, -6.4522, 32.4130]).to(torch_device) @@ -831,6 +831,7 @@ def test_inference_docvqa(self): num_beams=1, bad_words_ids=[[processor.tokenizer.unk_token_id]], output_scores=True, + return_dict_in_generate=True, ) sequence = processor.batch_decode(outputs.sequences)[0] sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") @@ -838,7 +839,7 @@ def test_inference_docvqa(self): # verify generated sequence self.assertEqual( - sequence, [" When is the coffee break? 11-14 to 11:39 a.m."] + sequence, " When is the coffee break? 11-14 to 11:39 a.m." 
) # verify scores @@ -855,7 +856,7 @@ def test_inference_cordv2(self): model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-cord-v2").to(torch_device) dataset = load_dataset("hf-internal-testing/example-documents", split="test") - image = dataset[1]["image"] + image = dataset[2]["image"] pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) decoder_input_ids = processor.tokenizer( @@ -871,7 +872,7 @@ def test_inference_cordv2(self): expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size)) self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = torch.tensor([-24.4937, -2.8204, -15.3401], device=torch_device) + expected_slice = torch.tensor([-27.4344, -3.2686, -19.3524], device=torch_device) self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) # step 2: generation @@ -890,6 +891,7 @@ def test_inference_cordv2(self): num_beams=1, bad_words_ids=[[processor.tokenizer.unk_token_id]], output_scores=True, + return_dict_in_generate=True, ) sequence = processor.batch_decode(outputs.sequences)[0] @@ -951,6 +953,7 @@ def test_inference_rvlcdip(self): num_beams=1, bad_words_ids=[[processor.tokenizer.unk_token_id]], output_scores=True, + return_dict_in_generate=True, ) sequence = processor.batch_decode(outputs.sequences)[0] @@ -958,7 +961,7 @@ def test_inference_rvlcdip(self): sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token # verify generated sequence - self.assertEqual(sequence, [""]) + self.assertEqual(sequence, "") # verify scores self.assertEqual(len(outputs.scores), 4) From b088c477d9ba7f15f91474144ace5e606e6bd535 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 4 Aug 2022 17:58:17 +0200 Subject: [PATCH 15/35] Remove register_buffer --- src/transformers/models/donut/convert_donut_to_pytorch.py | 6 ++---- src/transformers/models/swin/modeling_swin.py | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 57d893a6d5dd1..5d6f542d8ef7f 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -125,7 +125,8 @@ def convert_state_dict(orig_state_dict, model): f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias" ] = val[-dim:] elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: - # TODO check attn_mask buffers + # HuggingFace implementation doesn't use attn_mask buffer + # and model doesn't use final LayerNorms for the encoder pass else: orig_state_dict[rename_key(key)] = val @@ -152,9 +153,6 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ dataset = load_dataset("hf-internal-testing/fixtures_docvqa") image = Image.open(dataset["test"][0]["file"]).convert("RGB") - # TODO maybe verify pixel values against original implementation - # original_pixel_values = original_model.encoder.prepare_input(image).unsqueeze(0) - tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True) feature_extractor = DonutFeatureExtractor( do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1] diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 662cdddb8b0fe..b25ca2956c62f 100644 --- 
a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -667,7 +667,6 @@ def forward( hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) attn_mask = self.get_attn_mask(height_pad, width_pad) - self.register_buffer("attn_mask", attn_mask) if attn_mask is not None: attn_mask = attn_mask.to(hidden_states_windows.device) From 9efb67c51a1f3168cd1bfc05a294ad692413eb0d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 5 Aug 2022 15:44:05 +0200 Subject: [PATCH 16/35] Fix toctree and add missing attribute --- docs/source/en/_toctree.yml | 2 ++ src/transformers/models/swinv2/configuration_swinv2.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 32ab4c6361d3a..2f8bce71ff220 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -218,6 +218,8 @@ title: DialoGPT - local: model_doc/distilbert title: DistilBERT + - local: model_doc/donut + title: Donut - local: model_doc/dpr title: DPR - local: model_doc/electra diff --git a/src/transformers/models/swinv2/configuration_swinv2.py b/src/transformers/models/swinv2/configuration_swinv2.py index f861be05fe1f8..41287865010e1 100644 --- a/src/transformers/models/swinv2/configuration_swinv2.py +++ b/src/transformers/models/swinv2/configuration_swinv2.py @@ -74,6 +74,8 @@ class Swinv2Config(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. + add_final_layer_norm (`bool`, *optional*, defaults to True): + Whether or not to add a final layer normalization layer after the stages. encoder_stride (`int`, `optional`, defaults to 32): Factor to increase the spatial resolution by in the decoder head for masked image modeling. 
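A short sketch of the new `add_final_layer_norm` flag documented above, as it is wired up by this patch (the flag is patch-specific and may not exist in released versions of the library): with the flag disabled, the backbone skips the final LayerNorm after the last stage, which is what the Donut encoder expects.

```py
# Hedged sketch, assuming the add_final_layer_norm flag introduced in this patch.
from transformers import Swinv2Config, Swinv2Model

config = Swinv2Config(add_final_layer_norm=False)
model = Swinv2Model(config)
print(model.layernorm)  # None: the final layer norm is skipped when the flag is disabled
```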
@@ -117,6 +119,7 @@ def __init__( patch_norm=True, initializer_range=0.02, layer_norm_eps=1e-5, + add_final_layer_norm=True, encoder_stride=32, **kwargs ): @@ -140,6 +143,7 @@ def __init__( self.path_norm = patch_norm self.layer_norm_eps = layer_norm_eps self.initializer_range = initializer_range + self.add_final_layer_norm = add_final_layer_norm self.encoder_stride = encoder_stride # we set the hidden_size attribute in order to make Swinv2 work with VisionEncoderDecoderModel # this indicates the channel dimension after the last stage of the model From ee09b625b818885854601541dae75600459ee62f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sat, 6 Aug 2022 11:18:43 +0200 Subject: [PATCH 17/35] Add DonutSwin --- docs/source/en/index.mdx | 1 + docs/source/en/model_doc/donut.mdx | 8 + src/transformers/__init__.py | 12 +- .../models/auto/configuration_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/donut/__init__.py | 32 +- .../models/donut/configuration_donut_swin.py | 140 +++ .../models/donut/modeling_donut_swin.py | 938 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 17 + src/transformers/utils/fx.py | 1 + tests/models/donut/__init__.py | 0 .../models/donut/test_modeling_donut_swin.py | 491 +++++++++ 12 files changed, 1638 insertions(+), 4 deletions(-) create mode 100644 src/transformers/models/donut/configuration_donut_swin.py create mode 100644 src/transformers/models/donut/modeling_donut_swin.py create mode 100644 tests/models/donut/__init__.py create mode 100644 tests/models/donut/test_modeling_donut_swin.py diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 2f5ffb67eab53..3c722949fcdf9 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -225,6 +225,7 @@ Flax), PyTorch, and/or TensorFlow. | DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | | DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Donut | ❌ | ❌ | ✅ | ❌ | ❌ | | DPR | ✅ | ✅ | ✅ | ✅ | ❌ | | DPT | ❌ | ❌ | ✅ | ❌ | ❌ | | ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/docs/source/en/model_doc/donut.mdx b/docs/source/en/model_doc/donut.mdx index 8af288873bddd..ae8ba351308af 100644 --- a/docs/source/en/model_doc/donut.mdx +++ b/docs/source/en/model_doc/donut.mdx @@ -94,6 +94,9 @@ into a single instance to both extract the input features and decode the predict See the [model hub](https://huggingface.co/models?filter=donut) to look for Donut checkpoints. 
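To make the new backbone concrete, here is a hedged sketch of what `DonutSwinModel` computes on its own: a batch of pixel values in, a sequence of patch-level hidden states out, which the text decoder then cross-attends to. Shapes assume the default configuration, not the larger input sizes used by Donut checkpoints.

```py
# Hedged sketch of the standalone DonutSwin backbone added in this patch.
import torch
from transformers import DonutSwinConfig, DonutSwinModel

config = DonutSwinConfig()  # defaults: image_size=224, patch_size=4, embed_dim=96, depths=[2, 2, 6, 2]
model = DonutSwinModel(config)

pixel_values = torch.randn(1, 3, 224, 224)
outputs = model(pixel_values=pixel_values)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 49, 768]) with the default config
```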
+## DonutSwinConfig + +[[autodoc]] DonutSwinConfig ## DonutFeatureExtractor @@ -109,3 +112,8 @@ See the [model hub](https://huggingface.co/models?filter=donut) to look for Donu - batch_decode - decode - as_target_processor + +## DonutSwinModel + +[[autodoc]] DonutSwinModel + - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 51e640e180fd2..d6444e0844ff5 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -190,7 +190,7 @@ "models.dialogpt": [], "models.distilbert": ["DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DistilBertConfig", "DistilBertTokenizer"], "models.dit": [], - "models.donut": ["DonutProcessor"], + "models.donut": ["DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "DonutProcessor", "DonutSwinConfig"], "models.dpr": [ "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPRConfig", @@ -1101,6 +1101,13 @@ "DistilBertPreTrainedModel", ] ) + _import_structure["models.donut"].extend( + [ + "DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST", + "DonutSwinModel", + "DonutSwinPreTrainedModel", + ] + ) _import_structure["models.dpr"].extend( [ "DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2986,7 +2993,7 @@ from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer - from .models.donut import DonutProcessor + from .models.donut import DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, DonutProcessor, DonutSwinConfig from .models.dpr import ( DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig, @@ -3765,6 +3772,7 @@ DistilBertModel, DistilBertPreTrainedModel, ) + from .models.donut import DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST, DonutSwinModel, DonutSwinPreTrainedModel from .models.dpr import ( DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST, DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 0fcc123a31586..6e3b47f4cafa6 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -56,6 +56,7 @@ ("deit", "DeiTConfig"), ("detr", "DetrConfig"), ("distilbert", "DistilBertConfig"), + ("donut", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), ("electra", "ElectraConfig"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index bd4774c245b07..bfaa099beb134 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -56,6 +56,7 @@ ("deit", "DeiTModel"), ("detr", "DetrModel"), ("distilbert", "DistilBertModel"), + ("donut", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), ("electra", "ElectraModel"), diff --git a/src/transformers/models/donut/__init__.py b/src/transformers/models/donut/__init__.py index 057dd49fff53f..a01f6b11a9a99 100644 --- a/src/transformers/models/donut/__init__.py +++ b/src/transformers/models/donut/__init__.py @@ -17,10 +17,25 @@ # limitations under the License. 
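Because the auto mappings above register the `donut` model type, the generic Auto classes can resolve the backbone directly from its configuration; a minimal sketch of that behavior as defined by this patch:

```py
# Hedged sketch: the auto-class mappings added in this patch resolve DonutSwinModel from its config.
from transformers import AutoModel, DonutSwinConfig

config = DonutSwinConfig()
model = AutoModel.from_config(config)
print(type(model).__name__)  # DonutSwinModel
```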
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_vision_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available -_import_structure = {"processing_donut": ["DonutProcessor"]} +_import_structure = { + "configuration_donut_swin": ["DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "DonutSwinConfig"], + "processing_donut": ["DonutProcessor"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_donut_swin"] = [ + "DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST", + "DonutSwinModel", + "DonutSwinPreTrainedModel", + ] try: if not is_vision_available(): @@ -32,8 +47,21 @@ if TYPE_CHECKING: + from .configuration_donut_swin import DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, DonutSwinConfig from .processing_donut import DonutProcessor + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_donut_swin import ( + DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST, + DonutSwinModel, + DonutSwinPreTrainedModel, + ) + try: if not is_vision_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py new file mode 100644 index 0000000000000..a7c7b70653ca1 --- /dev/null +++ b/src/transformers/models/donut/configuration_donut_swin.py @@ -0,0 +1,140 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Donut Swin Transformer model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "naver-clova-ix/donut-base": "https://huggingface.co/naver-clova-ix/donut-base/resolve/main/config.json", + # See all Donut models at https://huggingface.co/models?filter=donut-swin +} + + +class DonutSwinConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DonutSwinModel`]. It is used to instantiate a + Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Donut + [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 4): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. 
+ embed_dim (`int`, *optional*, defaults to 96): + Dimensionality of patch embedding. + depths (`list(int)`, *optional*, defaults to [2, 2, 6, 2]): + Depth of each layer in the Transformer encoder. + num_heads (`list(int)`, *optional*, defaults to [3, 6, 12, 24]): + Number of attention heads in each layer of the Transformer encoder. + window_size (`int`, *optional*, defaults to 7): + Size of windows. + mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of MLP hidden dimensionality to embedding dimensionality. + qkv_bias (`bool`, *optional*, defaults to True): + Whether or not a learnable bias should be added to the queries, keys and values. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + drop_path_rate (`float`, *optional*, defaults to 0.1): + Stochastic depth rate. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + use_absolute_embeddings (`bool`, *optional*, defaults to False): + Whether or not to add absolute position embeddings to the patch embeddings. + patch_norm (`bool`, *optional*, defaults to True): + Whether or not to add layer normalization after patch embedding. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + + Example: + + ```python + >>> from transformers import DonutSwinConfig, DonutSwinModel + + >>> # Initializing a Donut naver-clova-ix/donut-base style configuration + >>> configuration = DonutConfig() + + >>> # Initializing a model from the naver-clova-ix/donut-base style configuration + >>> model = DonutSwinModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "donut_swin" + + attribute_map = { + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + } + + def __init__( + self, + image_size=224, + patch_size=4, + num_channels=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + use_absolute_embeddings=False, + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + **kwargs + ): + super().__init__(**kwargs) + + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_layers = len(depths) + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.use_absolute_embeddings = use_absolute_embeddings + self.path_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel + # this indicates the channel 
dimension after the last stage of the model + self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py new file mode 100644 index 0000000000000..b619e0f6a22f3 --- /dev/null +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -0,0 +1,938 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Donut Swin Transformer model.""" + +import collections.abc +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_donut_swin import DonutSwinConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "DonutSwinConfig" +_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base" +_EXPECTED_OUTPUT_SHAPE = [1, 49, 768] + +DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "naver-clova-ix/donut-base", + # See all Donut Swin models at https://huggingface.co/models?filter=donut +] + + +@dataclass +# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->DonutSwin +class DonutSwinEncoderOutput(ModelOutput): + """ + DonutSwin encoder's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
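The `hidden_size` attribute derived at the end of the configuration above is what lets this encoder plug into `VisionEncoderDecoderModel`: every patch-merging stage doubles the channel dimension, so the width of the final stage is `embed_dim * 2 ** (num_stages - 1)`. A quick sanity check with the documented defaults (a standalone sketch, not library code):

```python
embed_dim = 96          # default embedding dimension
depths = [2, 2, 6, 2]   # four stages, with three patch-merging steps in between

# Each of the len(depths) - 1 patch-merging layers doubles the channel dimension.
hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
print(hidden_size)      # 768, matching the expected output shape [1, 49, 768] used in the docstrings
```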
+ reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->DonutSwin +class DonutSwinModelOutput(ModelOutput): + """ + DonutSwin model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed): + Average pooling of the last layer hidden-state. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: torch.FloatTensor = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +# Copied from transformers.models.swin.modeling_swin.window_partition +def window_partition(input_feature, window_size): + """ + Partitions the given input into windows. 
+ """ + batch_size, height, width, num_channels = input_feature.shape + input_feature = input_feature.view( + batch_size, height // window_size, window_size, width // window_size, window_size, num_channels + ) + windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) + return windows + + +# Copied from transformers.models.swin.modeling_swin.window_reverse +def window_reverse(windows, window_size, height, width): + """ + Merges windows to produce higher resolution features. + """ + batch_size = math.floor(windows.shape[0] / (height * width / window_size / window_size)) + windows = windows.view(batch_size, height // window_size, width // window_size, window_size, window_size, -1) + windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch_size, height, width, -1) + return windows + + +# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->DonutSwin +class DonutSwinEmbeddings(nn.Module): + """ + Construct the patch and position embeddings. Optionally, also the mask token. + """ + + def __init__(self, config, use_mask_token=False): + super().__init__() + + self.patch_embeddings = DonutSwinPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.patch_grid = self.patch_embeddings.grid_size + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None + + if config.use_absolute_embeddings: + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim)) + else: + self.position_embeddings = None + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None + ) -> Tuple[torch.Tensor]: + embeddings, output_dimensions = self.patch_embeddings(pixel_values) + embeddings = self.norm(embeddings) + batch_size, seq_len, _ = embeddings.size() + + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + if self.position_embeddings is not None: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings, output_dimensions + + +# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings +class DonutSwinPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
+ """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.embed_dim + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def maybe_pad(self, pixel_values, height, width): + if width % self.patch_size[1] != 0: + pad_values = (0, self.patch_size[1] - width % self.patch_size[1]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + if height % self.patch_size[0] != 0: + pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + return pixel_values + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]: + _, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + # pad the input to be divisible by self.patch_size, if needed + pixel_values = self.maybe_pad(pixel_values, height, width) + embeddings = self.projection(pixel_values) + _, _, height, width = embeddings.shape + output_dimensions = (height, width) + embeddings = embeddings.flatten(2).transpose(1, 2) + + return embeddings, output_dimensions + + +# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging +class DonutSwinPatchMerging(nn.Module): + """ + Patch Merging Layer. + + Args: + input_resolution (`Tuple[int]`): + Resolution of input feature. + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. 
+ """ + + def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def maybe_pad(self, input_feature, height, width): + should_pad = (height % 2 == 1) or (width % 2 == 1) + if should_pad: + pad_values = (0, 0, 0, width % 2, 0, height % 2) + input_feature = nn.functional.pad(input_feature, pad_values) + + return input_feature + + def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor: + height, width = input_dimensions + # `dim` is height * width + batch_size, dim, num_channels = input_feature.shape + + input_feature = input_feature.view(batch_size, height, width, num_channels) + # pad input to be disible by width and height, if needed + input_feature = self.maybe_pad(input_feature, height, width) + # [batch_size, height/2, width/2, num_channels] + input_feature_0 = input_feature[:, 0::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_1 = input_feature[:, 1::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_2 = input_feature[:, 0::2, 1::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_3 = input_feature[:, 1::2, 1::2, :] + # batch_size height/2 width/2 4*num_channels + input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) + input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # batch_size height/2*width/2 4*C + + input_feature = self.norm(input_feature) + input_feature = self.reduction(input_feature) + + return input_feature + + +# Copied from transformers.models.swin.modeling_swin.drop_path +def drop_path(input, drop_prob=0.0, training=False, scale_by_keep=True): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.swin.modeling_swin.SwinDropPath +class DonutSwinDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->DonutSwin +class DonutSwinSelfAttention(nn.Module): + def __init__(self, config, dim, num_heads): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + window_size = config.window_size + self.window_size = ( + window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size) + ) + + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads) + ) + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) + coords_flatten = torch.flatten(coords, 1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + batch_size, dim, num_channels = hidden_states.shape + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] + relative_position_bias = relative_position_bias.view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) + + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() + attention_scores = attention_scores + relative_position_bias.unsqueeze(0) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in DonutSwinModel forward() function) + mask_shape = attention_mask.shape[0] + attention_scores = attention_scores.view( + batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim + ) + attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) + attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput +class DonutSwinSelfOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->DonutSwin +class DonutSwinAttention(nn.Module): + def __init__(self, config, dim, num_heads): + super().__init__() + self.self = DonutSwinSelfAttention(config, dim, num_heads) + self.output = DonutSwinSelfOutput(config, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: 
Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinIntermediate +class DonutSwinIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinOutput +class DonutSwinOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin +class DonutSwinLayer(nn.Module): + def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.shift_size = shift_size + self.window_size = config.window_size + self.input_resolution = input_resolution + self.set_shift_and_window_size(input_resolution) + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = DonutSwinAttention(config, dim, num_heads) + self.drop_path = DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = DonutSwinIntermediate(config, dim) + self.output = DonutSwinOutput(config, dim) + + def set_shift_and_window_size(self, input_resolution): + if min(input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(input_resolution) + + def get_attn_mask(self, height, width): + if self.shift_size > 0: + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, height, width, 1)) + height_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + width_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + img_mask[:, height_slice, width_slice, :] = count + count += 1 + + mask_windows = window_partition(img_mask, self.window_size) + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + return attn_mask + + def maybe_pad(self, hidden_states, height, width): + pad_right = (self.window_size - width % 
self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + pad_values = (0, 0, 0, pad_right, 0, pad_bottom) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + self.set_shift_and_window_size(input_dimensions) + height, width = input_dimensions + batch_size, _, channels = hidden_states.size() + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states) + hidden_states = hidden_states.view(batch_size, height, width, channels) + # pad hidden_states to multiples of window size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + # cyclic shift + if self.shift_size > 0: + shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_hidden_states = hidden_states + + # partition windows + hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) + hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) + attn_mask = self.get_attn_mask(height_pad, width_pad) + if attn_mask is not None: + attn_mask = attn_mask.to(hidden_states_windows.device) + + attention_outputs = self.attention( + hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions + ) + + attention_output = attention_outputs[0] + + attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels) + shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) + + # reverse cyclic shift + if self.shift_size > 0: + attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + attention_windows = shifted_windows + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_windows = attention_windows[:, :height, :width, :].contiguous() + + attention_windows = attention_windows.view(batch_size, height * width, channels) + + hidden_states = shortcut + self.drop_path(attention_windows) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.output(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->DonutSwin +class DonutSwinStage(nn.Module): + def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): + super().__init__() + self.config = config + self.dim = dim + self.blocks = nn.ModuleList( + [ + DonutSwinLayer( + config=config, + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + shift_size=0 if (i % 2 == 0) else config.window_size // 2, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm) + else: + self.downsample = None + + self.pointing = False + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: 
Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + height, width = input_dimensions + for i, layer_module in enumerate(self.blocks): + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module(hidden_states, input_dimensions, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if self.downsample is not None: + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, width_downsampled) + hidden_states = self.downsample(layer_outputs[0], input_dimensions) + else: + output_dimensions = (height, width, height, width) + + stage_outputs = (hidden_states, output_dimensions) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinEncoder with Swin->DonutSwin +class DonutSwinEncoder(nn.Module): + def __init__(self, config, grid_size): + super().__init__() + self.num_layers = len(config.depths) + self.config = config + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + self.layers = nn.ModuleList( + [ + DonutSwinStage( + config=config, + dim=int(config.embed_dim * 2**i_layer), + input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=DonutSwinPatchMerging if (i_layer < self.num_layers - 1) else None, + ) + for i_layer in range(self.num_layers) + ] + ) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, DonutSwinEncoderOutput]: + all_input_dimensions = () + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), hidden_states, input_dimensions, layer_head_mask + ) + else: + layer_outputs = layer_module(hidden_states, input_dimensions, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + output_dimensions = layer_outputs[1] + + input_dimensions = (output_dimensions[-2], output_dimensions[-1]) + all_input_dimensions += (input_dimensions,) + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + 
reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[2:] + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return DonutSwinEncoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + reshaped_hidden_states=all_reshaped_hidden_states, + ) + + +# Copied from transformers.models.swin.modeling_swin.SwinPreTrainedModel with Swin->DonutSwin +class DonutSwinPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DonutSwinConfig + base_model_prefix = "swin" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, DonutSwinEncoder): + module.gradient_checkpointing = value + + +SWIN_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DonutSwinConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +SWIN_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
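Once this patch is merged, the configuration and the `DonutSwinModel` class defined below can be exercised end to end; a minimal smoke test with the default configuration might look as follows (random pixel values, with the shapes in the comments following from the patching and merging arithmetic above):

```python
import torch

from transformers import DonutSwinConfig, DonutSwinModel  # classes introduced by this patch

config = DonutSwinConfig()          # image_size=224, patch_size=4, embed_dim=96, depths=[2, 2, 6, 2]
model = DonutSwinModel(config)
model.eval()

pixel_values = torch.randn(1, 3, 224, 224)   # dummy image batch
with torch.no_grad():
    outputs = model(pixel_values)

# 224 / 4 = 56 patches per side, halved by each of the 3 merging steps -> 7 * 7 = 49 tokens of width 768
print(outputs.last_hidden_state.shape)  # torch.Size([1, 49, 768])
print(outputs.pooler_output.shape)      # torch.Size([1, 768])
```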
+""" + + +@add_start_docstrings( + "The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top.", + SWIN_START_DOCSTRING, +) +class DonutSwinModel(DonutSwinPreTrainedModel): + def __init__(self, config, add_pooling_layer=True, use_mask_token=False): + super().__init__(config) + self.config = config + self.num_layers = len(config.depths) + self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1)) + + self.embeddings = DonutSwinEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = DonutSwinEncoder(config, self.embeddings.patch_grid) + + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=DonutSwinModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, DonutSwinModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, len(self.config.depths)) + + embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + encoder_outputs = self.encoder( + embedding_output, + input_dimensions, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output.transpose(1, 2)) + pooled_output = torch.flatten(pooled_output, 1) + + if not return_dict: + output = (sequence_output, pooled_output) + encoder_outputs[1:] + + return output + + return DonutSwinModelOutput( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py 
b/src/transformers/utils/dummy_pt_objects.py index d636be655af28..96a93ecae942a 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1682,6 +1682,23 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class DonutSwinModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DonutSwinPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 990f278b0d506..066faec9b58e7 100644 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -105,6 +105,7 @@ def _generate_supported_model_class_names( "deberta", "deberta-v2", "distilbert", + "donut", "electra", "gpt2", "gpt_neo", diff --git a/tests/models/donut/__init__.py b/tests/models/donut/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py new file mode 100644 index 0000000000000..c649792cac544 --- /dev/null +++ b/tests/models/donut/test_modeling_donut_swin.py @@ -0,0 +1,491 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Donut Swin model. 
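The `dummy_pt_objects` entries registered above are what gets exported when PyTorch is missing: placeholder classes that raise an informative error only when they are actually used, so `from transformers import DonutSwinModel` keeps working in torch-free environments. A condensed, self-contained sketch of that pattern (the `requires_backends` helper here is a simplified stand-in for `transformers.utils.requires_backends`):

```python
def requires_backends(obj, backends):
    # simplified stand-in: the real helper also explains how to install the missing backend
    name = obj.__name__ if isinstance(obj, type) else obj.__class__.__name__
    raise ImportError(f"{name} requires the following backends: {', '.join(backends)}.")


class DummyObject(type):
    # attribute access on the class itself (e.g. DonutSwinModel.from_pretrained) also raises
    def __getattr__(cls, key):
        requires_backends(cls, cls._backends)


class DonutSwinModel(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])


try:
    DonutSwinModel()
except ImportError as err:
    print(err)  # DonutSwinModel requires the following backends: torch.
```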
""" + +import collections +import inspect +import os +import pickle +import tempfile +import unittest + +from transformers import DonutSwinConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_torch_fx_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DonutSwinModel + from transformers.models.donut.modeling_donut_swin import DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST + +if is_vision_available(): + from PIL import Image + + from transformers import AutoFeatureExtractor + +if is_torch_fx_available(): + from transformers.utils.fx import symbolic_trace + + +class DonutSwinModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + patch_size=2, + num_channels=3, + embed_dim=16, + depths=[1, 2, 1], + num_heads=[2, 2, 4], + window_size=2, + mlp_ratio=2.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + use_absolute_embeddings=False, + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + is_training=True, + scope=None, + use_labels=True, + type_sequence_label_size=10, + encoder_stride=8, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.use_absolute_embeddings = use_absolute_embeddings + self.patch_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.is_training = is_training + self.scope = scope + self.use_labels = use_labels + self.type_sequence_label_size = type_sequence_label_size + self.encoder_stride = encoder_stride + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DonutSwinConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + embed_dim=self.embed_dim, + depths=self.depths, + num_heads=self.num_heads, + window_size=self.window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + drop_path_rate=self.drop_path_rate, + hidden_act=self.hidden_act, + use_absolute_embeddings=self.use_absolute_embeddings, + path_norm=self.patch_norm, + layer_norm_eps=self.layer_norm_eps, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DonutSwinModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + expected_seq_len 
= ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1)) + expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1)) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DonutSwinModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + DonutSwinModel, + ) + if is_torch_available() + else () + ) + fx_compatible = True + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = DonutSwinModelTester(self) + self.config_tester = ConfigTester(self, config_class=DonutSwinConfig, embed_dim=37) + + def test_config(self): + self.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + def create_and_test_config_common_properties(self): + return + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_inputs_embeds(self): + # DonutSwin does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + expected_num_attentions = len(self.model_tester.depths) + self.assertEqual(len(attentions), expected_num_attentions) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + window_size_squared = config.window_size**2 + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), 
expected_num_attentions) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_heads[0], window_size_squared, window_size_squared], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + else: + # also another +1 for reshaped_hidden_states + added_hidden_states = 2 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), expected_num_attentions) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_heads[0], window_size_squared, window_size_squared], + ) + + def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # DonutSwin has a different seq_length + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + reshaped_hidden_states = outputs.reshaped_hidden_states + self.assertEqual(len(reshaped_hidden_states), expected_num_layers) + + batch_size, num_channels, height, width = reshaped_hidden_states[0].shape + reshaped_hidden_states = ( + reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1) + ) + self.assertListEqual( + list(reshaped_hidden_states.shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + def test_hidden_states_output_with_padding(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.patch_size = 3 + + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) 
+ ) + + padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) + padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + @slow + def test_model_from_pretrained(self): + for model_name in DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DonutSwinModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if "embeddings" not in name and param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): + if not is_torch_fx_available() or not self.fx_compatible: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.return_dict = False + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss) + + try: + if model.config.is_encoder_decoder: + model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward + labels = inputs.get("labels", None) + input_names = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"] + if labels is not None: + input_names.append("labels") + + filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} + input_names = list(filtered_inputs.keys()) + + model_output = model(**filtered_inputs) + + traced_model = symbolic_trace(model, input_names) + traced_output = traced_model(**filtered_inputs) + else: + input_names = ["input_ids", "attention_mask", "token_type_ids", "pixel_values"] + + labels = inputs.get("labels", None) + start_positions = inputs.get("start_positions", None) + end_positions = inputs.get("end_positions", None) + if labels is not None: + input_names.append("labels") + if start_positions is not None: + input_names.append("start_positions") + if end_positions is not None: + input_names.append("end_positions") + + filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} + input_names = list(filtered_inputs.keys()) + + model_output = model(**filtered_inputs) + + traced_model = symbolic_trace(model, input_names) + traced_output = traced_model(**filtered_inputs) + + except RuntimeError as e: + self.fail(f"Couldn't trace module: {e}") + + def flatten_output(output): + flatten = [] + for x in output: + if isinstance(x, (tuple, list)): + flatten += flatten_output(x) + elif not isinstance(x, torch.Tensor): + continue + else: + flatten.append(x) + return flatten + + model_output = flatten_output(model_output) + traced_output = 
flatten_output(traced_output) + num_outputs = len(model_output) + + for i in range(num_outputs): + self.assertTrue( + torch.allclose(model_output[i], traced_output[i]), + f"traced {i}th output doesn't match model {i}th output for {model_class}", + ) + + # Test that the model can be serialized and restored properly + with tempfile.TemporaryDirectory() as tmp_dir_name: + pkl_file_name = os.path.join(tmp_dir_name, "model.pkl") + try: + with open(pkl_file_name, "wb") as f: + pickle.dump(traced_model, f) + with open(pkl_file_name, "rb") as f: + loaded = pickle.load(f) + except Exception as e: + self.fail(f"Couldn't serialize / deserialize the traced model: {e}") + + loaded_output = loaded(**filtered_inputs) + loaded_output = flatten_output(loaded_output) + + for i in range(num_outputs): + self.assertTrue( + torch.allclose(model_output[i], loaded_output[i]), + f"serialized model {i}th output doesn't match model {i}th output for {model_class}", + ) + + +@require_vision +@require_torch +class DonutSwinModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + AutoFeatureExtractor.from_pretrained("naver-clova-ix/donut-base") + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + raise NotImplementedError("To do") \ No newline at end of file From 436dd4126b52b378e3031f08295ca08412441705 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sat, 6 Aug 2022 11:41:00 +0200 Subject: [PATCH 18/35] Make conversion script work --- .../models/donut/configuration_donut_swin.py | 2 +- .../models/donut/convert_donut_to_pytorch.py | 9 ++++----- .../models/donut/modeling_donut_swin.py | 5 ++++- .../models/swin/configuration_swin.py | 4 ---- .../models/swinv2/configuration_swinv2.py | 4 ---- tests/models/donut/test_modeling_donut_swin.py | 18 +++--------------- 6 files changed, 12 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py index a7c7b70653ca1..e24abc21c00fe 100644 --- a/src/transformers/models/donut/configuration_donut_swin.py +++ b/src/transformers/models/donut/configuration_donut_swin.py @@ -87,7 +87,7 @@ class DonutSwinConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "donut_swin" + model_type = "donut" attribute_map = { "num_attention_heads": "num_heads", diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 5d6f542d8ef7f..ffa7bfebdf210 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -24,10 +24,10 @@ from transformers import ( DonutFeatureExtractor, DonutProcessor, + DonutSwinConfig, + DonutSwinModel, MBartConfig, MBartForCausalLM, - SwinConfig, - SwinModel, VisionEncoderDecoderModel, XLMRobertaTokenizerFast, ) @@ -36,14 +36,13 @@ def get_configs(model): original_config = model.config - encoder_config = SwinConfig( + encoder_config = DonutSwinConfig( image_size=original_config.input_size, patch_size=4, depths=original_config.encoder_layer, num_heads=[4, 8, 16, 32], window_size=original_config.window_size, embed_dim=128, - add_final_layer_norm=False, ) decoder_config = MBartConfig( is_decoder=True, @@ -140,7 +139,7 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ # load HuggingFace model encoder_config, decoder_config = 
get_configs(original_model) - encoder = SwinModel(encoder_config) + encoder = DonutSwinModel(encoder_config) decoder = MBartForCausalLM(decoder_config) model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) model.eval() diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index b619e0f6a22f3..78e5cc81c1988 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -12,7 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Donut Swin Transformer model.""" +""" PyTorch Donut Swin Transformer model. + +This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden +states.""" import collections.abc import math diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py index 5b3410ae8ce3f..878a73e9208b5 100644 --- a/src/transformers/models/swin/configuration_swin.py +++ b/src/transformers/models/swin/configuration_swin.py @@ -75,8 +75,6 @@ class SwinConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - add_final_layer_norm (`bool`, *optional*, defaults to True): - Whether or not to add a final layer normalization layer after the stages. encoder_stride (`int`, `optional`, defaults to 32): Factor to increase the spatial resolution by in the decoder head for masked image modeling. @@ -120,7 +118,6 @@ def __init__( patch_norm=True, initializer_range=0.02, layer_norm_eps=1e-5, - add_final_layer_norm=True, encoder_stride=32, **kwargs ): @@ -144,7 +141,6 @@ def __init__( self.path_norm = patch_norm self.layer_norm_eps = layer_norm_eps self.initializer_range = initializer_range - self.add_final_layer_norm = add_final_layer_norm self.encoder_stride = encoder_stride # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel # this indicates the channel dimension after the last stage of the model diff --git a/src/transformers/models/swinv2/configuration_swinv2.py b/src/transformers/models/swinv2/configuration_swinv2.py index 41287865010e1..f861be05fe1f8 100644 --- a/src/transformers/models/swinv2/configuration_swinv2.py +++ b/src/transformers/models/swinv2/configuration_swinv2.py @@ -74,8 +74,6 @@ class Swinv2Config(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - add_final_layer_norm (`bool`, *optional*, defaults to True): - Whether or not to add a final layer normalization layer after the stages. encoder_stride (`int`, `optional`, defaults to 32): Factor to increase the spatial resolution by in the decoder head for masked image modeling. 
@@ -119,7 +117,6 @@ def __init__( patch_norm=True, initializer_range=0.02, layer_norm_eps=1e-5, - add_final_layer_norm=True, encoder_stride=32, **kwargs ): @@ -143,7 +140,6 @@ def __init__( self.path_norm = patch_norm self.layer_norm_eps = layer_norm_eps self.initializer_range = initializer_range - self.add_final_layer_norm = add_final_layer_norm self.encoder_stride = encoder_stride # we set the hidden_size attribute in order to make Swinv2 work with VisionEncoderDecoderModel # this indicates the channel dimension after the last stage of the model diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py index c649792cac544..f6772ac233d7e 100644 --- a/tests/models/donut/test_modeling_donut_swin.py +++ b/tests/models/donut/test_modeling_donut_swin.py @@ -37,8 +37,6 @@ from transformers.models.donut.modeling_donut_swin import DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST if is_vision_available(): - from PIL import Image - from transformers import AutoFeatureExtractor if is_torch_fx_available(): @@ -156,13 +154,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DonutSwinModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - DonutSwinModel, - ) - if is_torch_available() - else () - ) + all_model_classes = (DonutSwinModel,) if is_torch_available() else () fx_compatible = True test_pruning = False @@ -480,12 +472,8 @@ def flatten_output(output): class DonutSwinModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): - return ( - AutoFeatureExtractor.from_pretrained("naver-clova-ix/donut-base") - if is_vision_available() - else None - ) + return AutoFeatureExtractor.from_pretrained("naver-clova-ix/donut-base") if is_vision_available() else None @slow def test_inference_image_classification_head(self): - raise NotImplementedError("To do") \ No newline at end of file + raise NotImplementedError("To do") From 6dd1cac64e327d9b930a9d8d1645ec372e77683a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 7 Aug 2022 16:44:26 +0200 Subject: [PATCH 19/35] Improve conversion script --- .../models/donut/configuration_donut_swin.py | 6 +++--- .../models/donut/convert_donut_to_pytorch.py | 9 ++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py index e24abc21c00fe..7caef26cdd692 100644 --- a/src/transformers/models/donut/configuration_donut_swin.py +++ b/src/transformers/models/donut/configuration_donut_swin.py @@ -73,15 +73,15 @@ class DonutSwinConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. 
- Example: + Example: ```python >>> from transformers import DonutSwinConfig, DonutSwinModel >>> # Initializing a Donut naver-clova-ix/donut-base style configuration - >>> configuration = DonutConfig() + >>> configuration = DonutSwinConfig() - >>> # Initializing a model from the naver-clova-ix/donut-base style configuration + >>> # Randomly initializing a model from the naver-clova-ix/donut-base style configuration >>> model = DonutSwinModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index ffa7bfebdf210..c3eabc83135e7 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -18,7 +18,6 @@ import torch from datasets import load_dataset -from PIL import Image from donut import DonutModel from transformers import ( @@ -149,8 +148,8 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ model.load_state_dict(new_state_dict) # verify results on scanned document - dataset = load_dataset("hf-internal-testing/fixtures_docvqa") - image = Image.open(dataset["test"][0]["file"]).convert("RGB") + dataset = load_dataset("hf-internal-testing/example-documents") + image = dataset["test"][0]["image"].convert("RGB") tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True) feature_extractor = DonutFeatureExtractor( @@ -197,8 +196,8 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - model.push_to_hub(model_name.split("/")[-1], organization="nielsr") - processor.push_to_hub(model_name.split("/")[-1], organization="nielsr") + model.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model") + processor.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model") if __name__ == "__main__": From b2822c4164ac7e83dff3c4b2f00111c7393da19b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 7 Aug 2022 17:06:50 +0200 Subject: [PATCH 20/35] Address comment --- src/transformers/models/donut/feature_extraction_donut.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/transformers/models/donut/feature_extraction_donut.py b/src/transformers/models/donut/feature_extraction_donut.py index 6bcd0c2f25b62..6210fb390bcde 100644 --- a/src/transformers/models/donut/feature_extraction_donut.py +++ b/src/transformers/models/donut/feature_extraction_donut.py @@ -116,12 +116,7 @@ def pad(self, image: Image.Image, size: Tuple[int, int], random_padding: bool = pad_width = delta_width // 2 pad_height = delta_height // 2 - padding = ( - pad_width, - pad_height, - delta_width - pad_width, - delta_height - pad_height, - ) + padding = (pad_width, pad_height, delta_width - pad_width, delta_height - pad_height) return ImageOps.expand(image, padding) def __call__( From 0a674d2801c2b49c57208d1d44cbe3ef258fa080 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 7 Aug 2022 17:12:52 +0200 Subject: [PATCH 21/35] Fix bug --- src/transformers/models/donut/configuration_donut_swin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py index 7caef26cdd692..5c65f8a896fc2 100644 --- a/src/transformers/models/donut/configuration_donut_swin.py +++ 
b/src/transformers/models/donut/configuration_donut_swin.py @@ -20,7 +20,7 @@ logger = logging.get_logger(__name__) -SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP = { +DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP = { "naver-clova-ix/donut-base": "https://huggingface.co/naver-clova-ix/donut-base/resolve/main/config.json", # See all Donut models at https://huggingface.co/models?filter=donut-swin } From e0adc82696100dd3a37124595b721bc03fca0f6c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 7 Aug 2022 17:20:38 +0200 Subject: [PATCH 22/35] Fix another bug --- src/transformers/models/swin/modeling_swin.py | 4 +--- src/transformers/models/swinv2/modeling_swinv2.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index b25ca2956c62f..4d2b21a80d335 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -922,9 +922,7 @@ def __init__(self, config, add_pooling_layer=True, use_mask_token=False): self.embeddings = SwinEmbeddings(config, use_mask_token=use_mask_token) self.encoder = SwinEncoder(config, self.embeddings.patch_grid) - self.layernorm = ( - nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) if config.add_final_layer_norm else None - ) + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None # Initialize weights and apply final processing diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 26cb4e2398dd9..7bc8d64d83964 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -1002,9 +1002,7 @@ def __init__(self, config, add_pooling_layer=True, use_mask_token=False): self.embeddings = Swinv2Embeddings(config, use_mask_token=use_mask_token) self.encoder = Swinv2Encoder(config, self.embeddings.patch_grid) - self.layernorm = ( - nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) if config.add_final_layer_norm else None - ) + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None # Initialize weights and apply final processing From 8ddc4c1e99d603d7bf7a8176d138c0cea4ad0a6c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 8 Aug 2022 12:19:00 +0200 Subject: [PATCH 23/35] Remove deprecated method from docs --- docs/source/en/model_doc/donut.mdx | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/model_doc/donut.mdx b/docs/source/en/model_doc/donut.mdx index ae8ba351308af..845d0ac439364 100644 --- a/docs/source/en/model_doc/donut.mdx +++ b/docs/source/en/model_doc/donut.mdx @@ -111,7 +111,6 @@ See the [model hub](https://huggingface.co/models?filter=donut) to look for Donu - save_pretrained - batch_decode - decode - - as_target_processor ## DonutSwinModel From 0f59956e857bf1e92cab819a444cf970ec0e00e3 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 8 Aug 2022 12:24:50 +0200 Subject: [PATCH 24/35] Make Swin and Swinv2 untouched --- src/transformers/models/swin/modeling_swin.py | 3 +-- src/transformers/models/swinv2/modeling_swinv2.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 4d2b21a80d335..48c9b8cccf9ec 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ 
b/src/transformers/models/swin/modeling_swin.py @@ -985,8 +985,7 @@ def forward( ) sequence_output = encoder_outputs[0] - if self.layernorm is not None: - sequence_output = self.layernorm(sequence_output) + sequence_output = self.layernorm(sequence_output) pooled_output = None if self.pooler is not None: diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 7bc8d64d83964..52f836d5b91d3 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -1065,8 +1065,7 @@ def forward( ) sequence_output = encoder_outputs[0] - if self.layernorm is not None: - sequence_output = self.layernorm(sequence_output) + sequence_output = self.layernorm(sequence_output) pooled_output = None if self.pooler is not None: From b50074983c54b0a312f097a24aa6db6f6cd34a66 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 8 Aug 2022 15:23:07 +0200 Subject: [PATCH 25/35] Fix code examples --- docs/source/en/model_doc/donut.mdx | 81 ++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 15 deletions(-) diff --git a/docs/source/en/model_doc/donut.mdx b/docs/source/en/model_doc/donut.mdx index 845d0ac439364..88cb24b87b006 100644 --- a/docs/source/en/model_doc/donut.mdx +++ b/docs/source/en/model_doc/donut.mdx @@ -47,20 +47,69 @@ The [`DonutFeatureExtractor`] class is responsible for preprocessing the input i [`DonutProcessor`] wraps [`DonutFeatureExtractor`] and [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`] into a single instance to both extract the input features and decode the predicted token ids. +- Step-by-step Document Image Classification + +```py +>>> import re + +>>> from transformers import DonutProcessor, VisionEncoderDecoderModel +>>> from datasets import load_dataset +>>> import torch + +>>> processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-rvlcdip") +>>> model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-rvlcdip") + +>>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> model.to(device) # doctest: +IGNORE_RESULT + +>>> # load document image +>>> dataset = load_dataset("hf-internal-testing/example-documents", split="test") +>>> image = dataset[1]["image"] + +>>> # prepare decoder inputs +>>> task_prompt = "" +>>> decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids + +>>> pixel_values = processor(image, return_tensors="pt").pixel_values + +>>> outputs = model.generate( +... pixel_values.to(device), +... decoder_input_ids=decoder_input_ids.to(device), +... max_length=model.decoder.config.max_position_embeddings, +... early_stopping=True, +... pad_token_id=processor.tokenizer.pad_token_id, +... eos_token_id=processor.tokenizer.eos_token_id, +... use_cache=True, +... num_beams=1, +... bad_words_ids=[[processor.tokenizer.unk_token_id]], +... return_dict_in_generate=True, +... ) + +>>> sequence = processor.batch_decode(outputs.sequences)[0] +>>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") +>>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token +>>> print(sequence) + +``` + +We refer to the example notebooks regarding converting the model output back to JSON. +The code is exactly the same for document parsing, except that the task prompt is different (e.g. ""). 
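The notebook-based JSON conversion referred to in the added text above amounts to parsing the XML-like tags out of the generated sequence. A minimal sketch of that idea follows; the helper name `sequence_to_json` and the example string are hypothetical (not part of this patch), and the real conversion is handled by the `token2json` method added to `DonutProcessor` later in this series.

```py
import re


def sequence_to_json(sequence: str) -> dict:
    """Toy parser: map flat "<s_key>value</s_key>" fragments to {"key": "value"}.

    Nested fields, "<sep/>" separators and categorical "<.../>" tokens are not handled here.
    """
    output = {}
    for key, value in re.findall(r"<s_(.+?)>(.*?)</s_\1>", sequence):
        output[key] = value.strip()
    return output


print(sequence_to_json("<s_question> When is the coffee break?</s_question><s_answer> 11-14 to 11:39 a.m.</s_answer>"))
# -> {'question': 'When is the coffee break?', 'answer': '11-14 to 11:39 a.m.'}
```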
+Another example can be found below: + - Step-by-step Document Visual Question Answering (DocVQA) -``` py +```py >>> import re >>> from transformers import DonutProcessor, VisionEncoderDecoderModel >>> from datasets import load_dataset >>> import torch ->>> processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-docvqa") +>>> processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-docvqa") >>> model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-docvqa") >>> device = "cuda" if torch.cuda.is_available() else "cpu" ->>> model.to(device) +>>> model.to(device) # doctest: +IGNORE_RESULT >>> # load document image from the DocVQA dataset >>> dataset = load_dataset("hf-internal-testing/example-documents", split="test") @@ -72,18 +121,20 @@ into a single instance to both extract the input features and decode the predict >>> prompt = task_prompt.replace("{user_input}", question) >>> decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids ->>> pixel_values = processor(image, return_tensors="pt").pixel_values - ->>> outputs = model.generate(pixel_values.to(device), -... decoder_input_ids=decoder_input_ids.to(device), -... max_length=model.decoder.config.max_position_embeddings, -... early_stopping=True, -... pad_token_id=processor.tokenizer.pad_token_id, -... eos_token_id=processor.tokenizer.eos_token_id, -... use_cache=True, -... num_beams=1, -... bad_words_ids=[[processor.tokenizer.unk_token_id]], -... return_dict_in_generate=True) +>>> pixel_values = processor(image, return_tensors="pt").pixel_values + +>>> outputs = model.generate( +... pixel_values.to(device), +... decoder_input_ids=decoder_input_ids.to(device), +... max_length=model.decoder.config.max_position_embeddings, +... early_stopping=True, +... pad_token_id=processor.tokenizer.pad_token_id, +... eos_token_id=processor.tokenizer.eos_token_id, +... use_cache=True, +... num_beams=1, +... bad_words_ids=[[processor.tokenizer.unk_token_id]], +... return_dict_in_generate=True, +... ) >>> sequence = processor.batch_decode(outputs.sequences)[0] >>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") From 901baf4148a59786cf53c9f00f42813ccb4763f0 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 8 Aug 2022 15:48:41 +0200 Subject: [PATCH 26/35] Fix processor --- .../models/donut/processing_donut.py | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index d56972872437b..80babfe92ab9a 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -15,6 +15,7 @@ """ Processor class for Donut. """ +import warnings from contextlib import contextmanager from ...processing_utils import ProcessorMixin @@ -41,6 +42,7 @@ class DonutProcessor(ProcessorMixin): def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor + self._in_target_context_manager = False def __call__(self, *args, **kwargs): """ @@ -49,7 +51,31 @@ def __call__(self, *args, **kwargs): [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's [`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. 
""" - return self.current_processor(*args, **kwargs) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + images = kwargs.pop("images", None) + text = kwargs.pop("text", None) + if len(args) > 0: + images = args[0] + args = args[1:] + + if images is None and text is None: + raise ValueError("You need to specify either an `images` or `text` input to process.") + + if images is not None: + inputs = self.feature_extractor(images, *args, **kwargs) + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + elif images is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs def batch_decode(self, *args, **kwargs): """ @@ -68,8 +94,15 @@ def decode(self, *args, **kwargs): @contextmanager def as_target_processor(self): """ - Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning Donut. + Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR. """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your images inputs, or in a separate call." + ) + self._in_target_context_manager = True self.current_processor = self.tokenizer yield self.current_processor = self.feature_extractor + self._in_target_context_manager = False From bec1d783fd32881784490263f8ee657c4ea51520 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 8 Aug 2022 18:24:42 +0200 Subject: [PATCH 27/35] Update model_type to donut-swin --- src/transformers/models/auto/configuration_auto.py | 6 ++++-- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/donut/configuration_donut_swin.py | 2 +- utils/check_repo.py | 1 + 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6e3b47f4cafa6..2a831cbb0513c 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -56,7 +56,7 @@ ("deit", "DeiTConfig"), ("detr", "DetrConfig"), ("distilbert", "DistilBertConfig"), - ("donut", "DonutSwinConfig"), + ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), ("electra", "ElectraConfig"), @@ -182,6 +182,7 @@ ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("donut-swin", "DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("dpr", "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("dpt", "DPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("electra", "ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -305,7 +306,7 @@ ("dialogpt", "DialoGPT"), ("distilbert", "DistilBERT"), ("dit", "DiT"), - ("donut", "Donut"), + ("donut-swin", "Donut"), ("dpr", "DPR"), ("dpt", "DPT"), ("electra", "ELECTRA"), @@ -422,6 +423,7 @@ ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec"), ("data2vec-vision", "data2vec"), + ("donut-swin", "donut"), ] ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index bfaa099beb134..0e026cb48d0c0 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -56,7 +56,7 @@ ("deit", "DeiTModel"), ("detr", 
"DetrModel"), ("distilbert", "DistilBertModel"), - ("donut", "DonutSwinModel"), + ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), ("electra", "ElectraModel"), diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py index 5c65f8a896fc2..d3316bdc79f68 100644 --- a/src/transformers/models/donut/configuration_donut_swin.py +++ b/src/transformers/models/donut/configuration_donut_swin.py @@ -87,7 +87,7 @@ class DonutSwinConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "donut" + model_type = "donut-swin" attribute_map = { "num_attention_heads": "num_heads", diff --git a/utils/check_repo.py b/utils/check_repo.py index d2271e87ebf17..254467113d6cb 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -206,6 +206,7 @@ ("data2vec-text", "data2vec"), ("data2vec-audio", "data2vec"), ("data2vec-vision", "data2vec"), + ("donut-swin", "donut"), ] ) From 159a4591e58b80bd68ee4380d972a072d1c7142f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 10 Aug 2022 12:41:47 +0200 Subject: [PATCH 28/35] Add feature extractor tests, add token2json method, improve feature extractor --- src/transformers/image_utils.py | 22 ++ .../models/donut/feature_extraction_donut.py | 6 +- .../models/donut/processing_donut.py | 48 +++++ .../donut/test_feature_extraction_donut.py | 199 ++++++++++++++++++ 4 files changed, 272 insertions(+), 3 deletions(-) create mode 100644 tests/models/donut/test_feature_extraction_donut.py diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index dd7bb326993d3..e5a395341c003 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -376,3 +376,25 @@ def flip_channel_order(self, image): image = self.to_numpy_array(image) return image[::-1, :, :] + + def rotate(self, image, angle, resample=PIL.Image.NEAREST, expand=0, center=None, translate=None, fillcolor=None): + """ + Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees + counter clockwise around its centre. + + Args: + image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`): + The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before + rotating. + + Returns: + image: A rotated `PIL.Image.Image`. 
+ """ + self._ensure_format_supported(image) + + if not isinstance(image, PIL.Image.Image): + image = self.to_pil_image(image) + + return image.rotate( + angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor + ) diff --git a/src/transformers/models/donut/feature_extraction_donut.py b/src/transformers/models/donut/feature_extraction_donut.py index 6210fb390bcde..083eb2386c0f4 100644 --- a/src/transformers/models/donut/feature_extraction_donut.py +++ b/src/transformers/models/donut/feature_extraction_donut.py @@ -88,12 +88,12 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD - def rotate(self, image, size): + def rotate_image(self, image, size): if not isinstance(image, Image.Image): image = self.to_pil_image(image) if (size[1] > size[0] and image.width > image.height) or (size[1] < size[0] and image.width < image.height): - image = image.rotate(angle=-90, expand=True) + image = self.rotate(image, angle=-90, expand=True) return image @@ -185,7 +185,7 @@ def __call__( # transformations (rotating + resizing + padding + normalization) if self.do_align_long_axis: - images = [self.rotate(image, self.size) for image in images] + images = [self.rotate_image(image, self.size) for image in images] if self.do_resize and self.size is not None: images = [ self.resize_and_thumbnail(image=image, size=self.size, resample=self.resample) for image in images diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 80babfe92ab9a..1b00d894bd087 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -15,6 +15,7 @@ """ Processor class for Donut. """ +import re import warnings from contextlib import contextmanager @@ -106,3 +107,50 @@ def as_target_processor(self): yield self.current_processor = self.feature_extractor self._in_target_context_manager = False + + def token2json(self, tokens, is_inner_value=False): + """ + Convert a (generated) token sequence into an ordered JSON format. 
+        """
+        output = dict()
+
+        while tokens:
+            start_token = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
+            if start_token is None:
+                break
+            key = start_token.group(1)
+            end_token = re.search(rf"</s_{key}>", tokens, re.IGNORECASE)
+            start_token = start_token.group()
+            if end_token is None:
+                tokens = tokens.replace(start_token, "")
+            else:
+                end_token = end_token.group()
+                start_token_escaped = re.escape(start_token)
+                end_token_escaped = re.escape(end_token)
+                content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE)
+                if content is not None:
+                    content = content.group(1).strip()
+                    if r"<s_" in content and r"</s_" in content:  # non-leaf node
+                        value = self.token2json(content, is_inner_value=True)
+                        if value:
+                            if len(value) == 1:
+                                value = value[0]
+                            output[key] = value
+                    else:  # leaf nodes
+                        output[key] = []
+                        for leaf in content.split(r"<sep/>"):
+                            leaf = leaf.strip()
+                            if leaf in self.tokenizer.get_added_vocab() and leaf[0] == "<" and leaf[-2:] == "/>":
+                                leaf = leaf[1:-2]  # for categorical special tokens
+                            output[key].append(leaf)
+                        if len(output[key]) == 1:
+                            output[key] = output[key][0]
+
+                tokens = tokens[tokens.find(end_token) + len(end_token) :].strip()
+                if tokens[:6] == r"<sep/>":  # non-leaf nodes
+                    return [output] + self.token2json(tokens[6:], is_inner_value=True)
+
+        if len(output):
+            return [output] if is_inner_value else output
+        else:
+            return [] if is_inner_value else {"text_sequence": tokens}
diff --git a/tests/models/donut/test_feature_extraction_donut.py b/tests/models/donut/test_feature_extraction_donut.py
new file mode 100644
index 0000000000000..6391ad8777737
--- /dev/null
+++ b/tests/models/donut/test_feature_extraction_donut.py
@@ -0,0 +1,199 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
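Before the new feature extractor tests below, a quick aside on the `token2json` helper that closes out `processing_donut.py` above. The sketch shows roughly how it behaves; the checkpoint name is taken from the diffs in this series, while the token sequence and the resulting dictionary are illustrative rather than doctest output.

```py
from transformers import DonutProcessor

processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")

# A CORD-style generation result: nested "<s_key> ... </s_key>" groups with leaf values
sequence = "<s_menu><s_nm>CINNAMON SUGAR</s_nm><s_cnt>1 x</s_cnt><s_price>17,000</s_price></s_menu>"

# token2json walks the tags recursively and rebuilds the nested structure
print(processor.token2json(sequence))
# expected along the lines of: {'menu': {'nm': 'CINNAMON SUGAR', 'cnt': '1 x', 'price': '17,000'}}
```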
+ + +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DonutFeatureExtractor + + +class DonutFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=[20, 18], + do_align_axis=False, + do_pad=True, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_align_axis = do_align_axis + self.do_pad = do_pad + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_align_long_axis": self.do_align_axis, + "do_pad": self.do_pad, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + +@require_torch +@require_vision +class DonutFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = DonutFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = DonutFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "do_align_long_axis")) + self.assertTrue(hasattr(feature_extractor, "do_pad")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = 
self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) \ No newline at end of file From 1fe58994c6af455313fae2b463351129e248f614 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 10 Aug 2022 15:19:21 +0200 Subject: [PATCH 29/35] Fix failing tests, remove integration test --- src/transformers/utils/fx.py | 2 +- .../donut/test_feature_extraction_donut.py | 4 ++-- .../models/donut/test_modeling_donut_swin.py | 19 ++----------------- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 066faec9b58e7..3c3babd403778 100644 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -105,7 +105,7 @@ def _generate_supported_model_class_names( "deberta", "deberta-v2", "distilbert", - "donut", + "donut-swin", "electra", "gpt2", "gpt_neo", diff --git a/tests/models/donut/test_feature_extraction_donut.py b/tests/models/donut/test_feature_extraction_donut.py index 6391ad8777737..9807d5b46cb23 100644 --- a/tests/models/donut/test_feature_extraction_donut.py +++ b/tests/models/donut/test_feature_extraction_donut.py @@ -72,7 +72,7 @@ def prepare_feat_extract_dict(self): "do_pad": self.do_pad, "do_normalize": self.do_normalize, "image_mean": self.image_mean, - "image_std": self.image_std, + "image_std": self.image_std, } @@ -196,4 +196,4 @@ def test_call_pytorch(self): self.feature_extract_tester.size[1], self.feature_extract_tester.size[0], ), - ) \ No newline at end of file + ) diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py index f6772ac233d7e..f909d961880a9 100644 --- a/tests/models/donut/test_modeling_donut_swin.py +++ 
b/tests/models/donut/test_modeling_donut_swin.py @@ -22,8 +22,8 @@ import unittest from transformers import DonutSwinConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device -from transformers.utils import cached_property, is_torch_available, is_torch_fx_available, is_vision_available +from transformers.testing_utils import require_torch, slow, torch_device +from transformers.utils import is_torch_available, is_torch_fx_available from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor @@ -36,9 +36,6 @@ from transformers import DonutSwinModel from transformers.models.donut.modeling_donut_swin import DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST -if is_vision_available(): - from transformers import AutoFeatureExtractor - if is_torch_fx_available(): from transformers.utils.fx import symbolic_trace @@ -465,15 +462,3 @@ def flatten_output(output): torch.allclose(model_output[i], loaded_output[i]), f"serialized model {i}th output doesn't match model {i}th output for {model_class}", ) - - -@require_vision -@require_torch -class DonutSwinModelIntegrationTest(unittest.TestCase): - @cached_property - def default_feature_extractor(self): - return AutoFeatureExtractor.from_pretrained("naver-clova-ix/donut-base") if is_vision_available() else None - - @slow - def test_inference_image_classification_head(self): - raise NotImplementedError("To do") From 4173d17adf184e1833c8beef57c1d00bd9d957e5 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 11 Aug 2022 12:25:27 +0200 Subject: [PATCH 30/35] Add do_thumbnail for consistency --- .../models/donut/feature_extraction_donut.py | 22 ++++++++++++------- .../donut/test_feature_extraction_donut.py | 4 ++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/donut/feature_extraction_donut.py b/src/transformers/models/donut/feature_extraction_donut.py index 083eb2386c0f4..09bf3a6ad1c15 100644 --- a/src/transformers/models/donut/feature_extraction_donut.py +++ b/src/transformers/models/donut/feature_extraction_donut.py @@ -42,8 +42,7 @@ class DonutFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin) Args: do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the shorter edge of the input to the minimum value of a certain `size`, and thumbnail the - input to the given `size`. + Whether to resize the shorter edge of the input to the minimum value of a certain `size`. size (`Tuple(int)`, *optional*, defaults to [1920, 2560]): Resize the shorter edge of the input to the minimum value of the given size. Should be a tuple of (width, height). Only has an effect if `do_resize` is set to `True`. @@ -51,6 +50,8 @@ class DonutFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin) An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect if `do_resize` is set to `True`. + do_thumbnail (`bool`, *optional*, defaults to `True`): + Whether to thumbnail the input to the given `size`. do_align_long_axis (`bool`, *optional*, defaults to `False`): Whether to rotate the input if the height is greater than width. 
do_pad (`bool`, *optional*, defaults to `True`): @@ -71,6 +72,7 @@ def __init__( do_resize=True, size=[1920, 2560], resample=Image.BILINEAR, + do_thumbnail=True, do_align_long_axis=False, do_pad=True, do_normalize=True, @@ -82,6 +84,7 @@ def __init__( self.do_resize = do_resize self.size = size self.resample = resample + self.do_thumbnail = do_thumbnail self.do_align_long_axis = do_align_long_axis self.do_pad = do_pad self.do_normalize = do_normalize @@ -97,10 +100,10 @@ def rotate_image(self, image, size): return image - def resize_and_thumbnail(self, image, size, resample): - # 1. resize the shorter edge of the image to `min(size)` - image = self.resize(image, size=min(size), resample=resample, default_to_square=False) - # 2. create a thumbnail + def thumbnail(self, image, size): + if not isinstance(image, Image.Image): + image = self.to_pil_image(image) + image.thumbnail((size[0], size[1])) return image @@ -183,13 +186,16 @@ def __call__( if not is_batched: images = [images] - # transformations (rotating + resizing + padding + normalization) + # transformations (rotating + resizing + thumbnailing + padding + normalization) if self.do_align_long_axis: images = [self.rotate_image(image, self.size) for image in images] if self.do_resize and self.size is not None: images = [ - self.resize_and_thumbnail(image=image, size=self.size, resample=self.resample) for image in images + self.resize(image=image, size=min(self.size), resample=self.resample, default_to_square=False) + for image in images ] + if self.do_thumbnail and self.size is not None: + images = [self.thumbnail(image=image, size=self.size) for image in images] if self.do_pad and self.size is not None: images = [self.pad(image=image, size=self.size, random_padding=random_padding) for image in images] if self.do_normalize: diff --git a/tests/models/donut/test_feature_extraction_donut.py b/tests/models/donut/test_feature_extraction_donut.py index 9807d5b46cb23..38ccbf2075a9b 100644 --- a/tests/models/donut/test_feature_extraction_donut.py +++ b/tests/models/donut/test_feature_extraction_donut.py @@ -44,6 +44,7 @@ def __init__( max_resolution=400, do_resize=True, size=[20, 18], + do_thumbnail=True, do_align_axis=False, do_pad=True, do_normalize=True, @@ -58,6 +59,7 @@ def __init__( self.max_resolution = max_resolution self.do_resize = do_resize self.size = size + self.do_thumbnail = do_thumbnail self.do_align_axis = do_align_axis self.do_pad = do_pad self.do_normalize = do_normalize @@ -68,6 +70,7 @@ def prepare_feat_extract_dict(self): return { "do_resize": self.do_resize, "size": self.size, + "do_thumbnail": self.do_thumbnail, "do_align_long_axis": self.do_align_axis, "do_pad": self.do_pad, "do_normalize": self.do_normalize, @@ -93,6 +96,7 @@ def test_feat_extract_properties(self): feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) self.assertTrue(hasattr(feature_extractor, "do_resize")) self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "do_thumbnail")) self.assertTrue(hasattr(feature_extractor, "do_align_long_axis")) self.assertTrue(hasattr(feature_extractor, "do_pad")) self.assertTrue(hasattr(feature_extractor, "do_normalize")) From 82cc409d995491f2ca3d4427aceab340230e23b6 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 11 Aug 2022 15:21:57 +0200 Subject: [PATCH 31/35] Improve code examples --- docs/source/en/model_doc/donut.mdx | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/source/en/model_doc/donut.mdx 
b/docs/source/en/model_doc/donut.mdx index 88cb24b87b006..77fc88b14aad2 100644 --- a/docs/source/en/model_doc/donut.mdx +++ b/docs/source/en/model_doc/donut.mdx @@ -88,12 +88,12 @@ into a single instance to both extract the input features and decode the predict >>> sequence = processor.batch_decode(outputs.sequences)[0] >>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") >>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token ->>> print(sequence) - +>>> print(processor.token2json(sequence)) +{'class': 'advertisement'} ``` -We refer to the example notebooks regarding converting the model output back to JSON. The code is exactly the same for document parsing, except that the task prompt is different (e.g. ""). + Another example can be found below: - Step-by-step Document Visual Question Answering (DocVQA) @@ -139,12 +139,16 @@ Another example can be found below: >>> sequence = processor.batch_decode(outputs.sequences)[0] >>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") >>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token ->>> print(sequence) - When is the coffee break? 11-14 to 11:39 a.m. +>>> print(processor.token2json(sequence)) +{'question': 'When is the coffee break?', 'answer': '11-14 to 11:39 a.m.'} ``` See the [model hub](https://huggingface.co/models?filter=donut) to look for Donut checkpoints. +## Training + +We refer to the [tutorial notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/donut). + ## DonutSwinConfig [[autodoc]] DonutSwinConfig From 18b2aed2c8e5bc0ca35bb2f75f5676b7f9d62f62 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 11 Aug 2022 15:28:32 +0200 Subject: [PATCH 32/35] Add code example for document parsing --- docs/source/en/model_doc/donut.mdx | 45 ++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/donut.mdx b/docs/source/en/model_doc/donut.mdx index 77fc88b14aad2..90600f7648f61 100644 --- a/docs/source/en/model_doc/donut.mdx +++ b/docs/source/en/model_doc/donut.mdx @@ -92,9 +92,50 @@ into a single instance to both extract the input features and decode the predict {'class': 'advertisement'} ``` -The code is exactly the same for document parsing, except that the task prompt is different (e.g. ""). +- Step-by-step Document Parsing -Another example can be found below: +```py +>>> import re + +>>> from transformers import DonutProcessor, VisionEncoderDecoderModel +>>> from datasets import load_dataset +>>> import torch + +>>> processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-cord-v2") +>>> model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-cord-v2") + +>>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> model.to(device) # doctest: +IGNORE_RESULT + +>>> # load document image +>>> dataset = load_dataset("hf-internal-testing/example-documents", split="test") +>>> image = dataset[2]["image"] + +>>> # prepare decoder inputs +>>> task_prompt = "" +>>> decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids + +>>> pixel_values = processor(image, return_tensors="pt").pixel_values + +>>> outputs = model.generate( +... pixel_values.to(device), +... decoder_input_ids=decoder_input_ids.to(device), +... max_length=model.decoder.config.max_position_embeddings, +... 
early_stopping=True, +... pad_token_id=processor.tokenizer.pad_token_id, +... eos_token_id=processor.tokenizer.eos_token_id, +... use_cache=True, +... num_beams=1, +... bad_words_ids=[[processor.tokenizer.unk_token_id]], +... return_dict_in_generate=True, +... ) + +>>> sequence = processor.batch_decode(outputs.sequences)[0] +>>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") +>>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token +>>> print(processor.token2json(sequence)) +{'menu': {'nm': 'CINNAMON SUGAR', 'unitprice': '17,000', 'cnt': '1 x', 'price': '17,000'}, 'sub_total': {'subtotal_price': '17,000'}, 'total': {'total_price': '17,000', 'cashprice': '20,000', 'changeprice': '3,000'}} +``` - Step-by-step Document Visual Question Answering (DocVQA) From 34c9f38ca12153e7ca2e374ca62fee9b5b80ffdb Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 11 Aug 2022 16:19:53 +0200 Subject: [PATCH 33/35] Add DonutSwin to MODEL_NAMES_MAPPING --- docs/source/en/index.mdx | 2 +- src/transformers/models/auto/configuration_auto.py | 3 ++- utils/check_copies.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 3c722949fcdf9..257eba8171ed1 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -225,7 +225,7 @@ Flax), PyTorch, and/or TensorFlow. | DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | | DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| Donut | ❌ | ❌ | ✅ | ❌ | ❌ | +| DonutSwin | ❌ | ❌ | ✅ | ❌ | ❌ | | DPR | ✅ | ✅ | ✅ | ✅ | ❌ | | DPT | ❌ | ❌ | ✅ | ❌ | ❌ | | ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 2a831cbb0513c..c9e6156a3843d 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -306,7 +306,8 @@ ("dialogpt", "DialoGPT"), ("distilbert", "DistilBERT"), ("dit", "DiT"), - ("donut-swin", "Donut"), + ("donut", "Donut"), + ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), ("electra", "ELECTRA"), diff --git a/utils/check_copies.py b/utils/check_copies.py index e2e0e1a53e433..7d57173654468 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -471,6 +471,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "Data2VecAudio": "Data2Vec", "Data2VecText": "Data2Vec", "Data2VecVision": "Data2Vec", + "DonutSwin": "Donut", "Marian": "MarianMT", "OpenAI GPT-2": "GPT-2", "OpenAI GPT": "GPT", From 397cb63498f520e8c8a839f4a1eb160bae2ad15f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 11 Aug 2022 19:56:44 +0200 Subject: [PATCH 34/35] Add model to appropriate place in toctree --- docs/source/en/_toctree.yml | 4 ++-- src/transformers/models/donut/convert_donut_to_pytorch.py | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 2f8bce71ff220..78137d2c8a74c 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -218,8 +218,6 @@ title: DialoGPT - local: model_doc/distilbert title: DistilBERT - - local: model_doc/donut - title: Donut - local: model_doc/dpr title: DPR - local: model_doc/electra @@ -429,6 +427,8 @@ title: CLIP - local: model_doc/data2vec title: Data2Vec + - local: model_doc/donut + title: Donut - local: model_doc/flava title: FLAVA - local: model_doc/groupvit diff --git 
a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index c3eabc83135e7..507f10cb776cf 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -164,8 +164,15 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ task_prompt = task_prompt.replace("{user_input}", question) elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip": task_prompt = "" + elif model_name in [ + "naver-clova-ix/donut-base-finetuned-cord-v1", + "naver-clova-ix/donut-base-finetuned-cord-v1-2560", + ]: + task_prompt = "" elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2": task_prompt = "s_cord-v2>" + elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket": + task_prompt = "" elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]: # use a random prompt task_prompt = "hello world" From c6bb809fac9b00596dbac1e8f45edd4b5835de6f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 12 Aug 2022 16:03:28 +0200 Subject: [PATCH 35/35] Update namespace to appropriate organization --- docs/source/en/model_doc/donut.mdx | 12 ++++++------ .../test_modeling_vision_encoder_decoder.py | 18 ++++++++++++------ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/docs/source/en/model_doc/donut.mdx b/docs/source/en/model_doc/donut.mdx index 90600f7648f61..9c9973be022e7 100644 --- a/docs/source/en/model_doc/donut.mdx +++ b/docs/source/en/model_doc/donut.mdx @@ -56,8 +56,8 @@ into a single instance to both extract the input features and decode the predict >>> from datasets import load_dataset >>> import torch ->>> processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-rvlcdip") ->>> model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-rvlcdip") +>>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip") +>>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip") >>> device = "cuda" if torch.cuda.is_available() else "cpu" >>> model.to(device) # doctest: +IGNORE_RESULT @@ -101,8 +101,8 @@ into a single instance to both extract the input features and decode the predict >>> from datasets import load_dataset >>> import torch ->>> processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-cord-v2") ->>> model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-cord-v2") +>>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") +>>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") >>> device = "cuda" if torch.cuda.is_available() else "cpu" >>> model.to(device) # doctest: +IGNORE_RESULT @@ -146,8 +146,8 @@ into a single instance to both extract the input features and decode the predict >>> from datasets import load_dataset >>> import torch ->>> processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-docvqa") ->>> model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-docvqa") +>>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa") +>>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa") >>> device = "cuda" if torch.cuda.is_available() else "cpu" >>> model.to(device) # doctest: +IGNORE_RESULT diff --git 
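As an aside to these namespace updates, the processor behaviour reworked in PATCH 26 above can be summed up with a short sketch: calling the processor with an image returns encoder pixel values, and passing `text` in the same call tokenizes it into `labels`, replacing the deprecated `as_target_processor` context manager. The target string below is illustrative, not taken from the patch.

```py
from datasets import load_dataset
from transformers import DonutProcessor

processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")

dataset = load_dataset("hf-internal-testing/example-documents", split="test")
image = dataset[0]["image"]

# Image only: preprocessed pixel values for the DonutSwin encoder
encoding = processor(image, return_tensors="pt")
print(encoding.pixel_values.shape)

# Image and target text in one call: pixel values plus tokenized labels
encoding = processor(image, text="<s_docvqa><s_question>When is the coffee break?</s_question>", return_tensors="pt")
print(encoding.keys())  # pixel_values and labels
```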
a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index e94c3c2b44df0..7570888097c53 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -790,8 +790,10 @@ def generate_step(pixel_values): class DonutModelIntegrationTest(unittest.TestCase): @slow def test_inference_docvqa(self): - processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-docvqa") - model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-docvqa").to(torch_device) + processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa") + model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa").to( + torch_device + ) dataset = load_dataset("hf-internal-testing/example-documents", split="test") image = dataset[0]["image"] @@ -852,8 +854,10 @@ def test_inference_docvqa(self): @slow def test_inference_cordv2(self): - processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-cord-v2") - model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-cord-v2").to(torch_device) + processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") + model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2").to( + torch_device + ) dataset = load_dataset("hf-internal-testing/example-documents", split="test") image = dataset[2]["image"] @@ -914,8 +918,10 @@ def test_inference_cordv2(self): @slow def test_inference_rvlcdip(self): - processor = DonutProcessor.from_pretrained("nielsr/donut-base-finetuned-rvlcdip") - model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base-finetuned-rvlcdip").to(torch_device) + processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip") + model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip").to( + torch_device + ) dataset = load_dataset("hf-internal-testing/example-documents", split="test") image = dataset[1]["image"]