From 885e4c32f4630083f5479b1651d7992b8557d18a Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 14 Oct 2022 08:33:31 +0000 Subject: [PATCH 01/14] First draft --- README.md | 1 + README_es.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.mdx | 2 + .../source/en/model_doc/table-transformer.mdx | 53 + docs/source/en/serialization.mdx | 1 + src/transformers/__init__.py | 16 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/table_transformer/__init__.py | 69 + .../configuration_table_transformer.py | 239 ++ ..._original_pytorch_checkpoint_to_pytorch.py | 286 +++ .../modeling_table_transformer.py | 1945 +++++++++++++++++ .../utils/dummy_timm_and_vision_objects.py | 24 + tests/models/table_transformer/__init__.py | 0 .../test_modeling_table_transformer.py | 524 +++++ utils/check_repo.py | 2 + 22 files changed, 3176 insertions(+) create mode 100644 docs/source/en/model_doc/table-transformer.mdx create mode 100644 src/transformers/models/table_transformer/__init__.py create mode 100644 src/transformers/models/table_transformer/configuration_table_transformer.py create mode 100644 src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/table_transformer/modeling_table_transformer.py create mode 100644 tests/models/table_transformer/__init__.py create mode 100644 tests/models/table_transformer/test_modeling_table_transformer.py diff --git a/README.md b/README.md index db24ce9d6a2d5..5f0173c6cc640 100644 --- a/README.md +++ b/README.md @@ -375,6 +375,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table_transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. 
**[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_es.md b/README_es.md index 733b22aba55a5..8c64b19a2ca5e 100644 --- a/README_es.md +++ b/README_es.md @@ -375,6 +375,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table_transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_ko.md b/README_ko.md index 8ec02629d7ac5..14bad0088fb5f 100644 --- a/README_ko.md +++ b/README_ko.md @@ -325,6 +325,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. 
**[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table_transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_zh-hans.md b/README_zh-hans.md index e8a68ca9f9d03..279e4853bd819 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -349,6 +349,7 @@ conda install -c huggingface transformers 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 +1. 
**[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table_transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_zh-hant.md b/README_zh-hant.md index 281001dbef7f1..3cc42138c2674 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -361,6 +361,7 @@ conda install -c huggingface transformers 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table_transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 1083d908b79c8..d9607c07b42c0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -412,6 +412,8 @@ title: Swin Transformer - local: model_doc/swinv2 title: Swin Transformer V2 + - local: model_doc/table_transformer + title: Table Transformer - local: model_doc/van title: VAN - local: model_doc/videomae diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 8465f1909841f..164d6afab4aaa 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -164,6 +164,7 @@ The documentation is organized into five sections: 1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[Table Transformer](model_doc/table_transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](model_doc/time_series_transformer)** (from HuggingFace). @@ -313,6 +314,7 @@ Flax), PyTorch, and/or TensorFlow. | Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ | | Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ | | T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Table Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | | TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | | Time Series Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | | Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/table-transformer.mdx b/docs/source/en/model_doc/table-transformer.mdx new file mode 100644 index 0000000000000..0ed63f462b12f --- /dev/null +++ b/docs/source/en/model_doc/table-transformer.mdx @@ -0,0 +1,53 @@ + + +# Table Transformer + +## Overview + +The Table Transformer model was proposed in [PubTables-1M: Towards comprehensive table extraction from unstructured documents](https://arxiv.org/abs/2110.00061) by +Brandon Smock, Rohith Pesala, Robin Abraham. 
+The authors introduce a new dataset, PubTables-1M, to benchmark progress in table extraction from unstructured documents,
+as well as table structure recognition and functional analysis. The authors train two [DETR](detr) models, one for table detection
+and one for table structure recognition, dubbed Table Transformers.

+The abstract from the paper is the following:

+*Recently, significant progress has been made applying machine learning to the problem of table structure inference and extraction from unstructured documents.
+However, one of the greatest challenges remains the creation of datasets with complete, unambiguous ground truth at scale. To address this, we develop a new, more
+comprehensive dataset for table extraction, called PubTables-1M. PubTables-1M contains nearly one million tables from scientific articles, supports multiple input
+modalities, and contains detailed header and location information for table structures, making it useful for a wide variety of modeling approaches. It also addresses a significant
+source of ground truth inconsistency observed in prior datasets called oversegmentation, using a novel canonicalization procedure. We demonstrate that these improvements lead to a
+significant increase in training performance and a more reliable estimate of model performance at evaluation for table structure recognition. Further, we show that transformer-based
+object detection models trained on PubTables-1M produce excellent results for all three tasks of detection, structure recognition, and functional analysis without the need for any
+special customization for these tasks.*

+Tips:

+- The authors released two models, one for table detection in documents and one for table structure recognition (the task of recognizing the individual rows, columns, etc. of a table).
+- Usage of these models is identical to that of other object detectors in the library, such as [`DetrForObjectDetection`]; see the sketch below.

+This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be
+found [here](https://github.com/microsoft/table-transformer).
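+As a minimal usage sketch (the local file name `document.png` is a placeholder; the checkpoint name comes from this
+PR's pretrained archive list, and [`DetrFeatureExtractor`] is reused for preprocessing since this draft does not add
+a dedicated feature extractor), a table detection forward pass mirrors the DETR API:

+```python
+from PIL import Image

+from transformers import DetrFeatureExtractor, TableTransformerForObjectDetection

+# placeholder: an image of a document page that contains a table
+image = Image.open("document.png").convert("RGB")

+feature_extractor = DetrFeatureExtractor()
+model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-table-detection")

+inputs = feature_extractor(images=image, return_tensors="pt")
+outputs = model(**inputs)

+# logits has shape (batch_size, num_queries, num_labels + 1); pred_boxes has shape
+# (batch_size, num_queries, 4), with boxes as (center_x, center_y, width, height) normalized to [0, 1]
+logits, boxes = outputs.logits, outputs.pred_boxes
+```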
+ + +## TableTransformerConfig + +[[autodoc]] TableTransformerConfig + +## TableTransformerModel + +[[autodoc]] TableTransformerModel + - forward + +## TableTransformerForObjectDetection + +[[autodoc]] TableTransformerForObjectDetection + - forward diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 7c89cac443194..269a7271fda0c 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -96,6 +96,7 @@ Ready-made configurations include the following architectures: - SqueezeBERT - Swin Transformer - T5 +- Table Transformer - Vision Encoder decoder - ViT - XLM diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 123d2c6cd412a..b30d8f719f3a0 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -335,6 +335,7 @@ "models.swin": ["SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "SwinConfig"], "models.swinv2": ["SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Swinv2Config"], "models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"], + "models.table_transformer": ["TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TableTransformerConfig"], "models.tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig", "TapasTokenizer"], "models.tapex": ["TapexTokenizer"], "models.time_series_transformer": [ @@ -738,6 +739,14 @@ "DetrPreTrainedModel", ] ) + _import_structure["models.table_transformer"].extend( + [ + "TABLE_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "TableTransformerForObjectDetection", + "TableTransformerModel", + "TableTransformerPreTrainedModel", + ] + ) _import_structure["models.conditional_detr"].extend( [ "CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3365,6 +3374,7 @@ from .models.swin import SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, SwinConfig from .models.swinv2 import SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP, Swinv2Config from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config + from .models.table_transformer import TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TableTransformerConfig from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer from .models.tapex import TapexTokenizer from .models.time_series_transformer import ( @@ -3717,6 +3727,12 @@ DetrModel, DetrPreTrainedModel, ) + from .models.table_transformer import ( + TABLE_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + TableTransformerForObjectDetection, + TableTransformerModel, + TableTransformerPreTrainedModel, + ) try: if not is_scatter_available(): diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 1f4c68a38c6dd..86a775a1eb2b8 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -138,6 +138,7 @@ swin, swinv2, t5, + table_transformer, tapas, tapex, time_series_transformer, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6b76a1ed9f780..2973e5574d6a4 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -134,6 +134,7 @@ ("swin", "SwinConfig"), ("swinv2", "Swinv2Config"), ("t5", "T5Config"), + ("table-transformer", "TableTransformerConfig"), ("tapas", "TapasConfig"), ("time_series_transformer", "TimeSeriesTransformerConfig"), ("trajectory_transformer", "TrajectoryTransformerConfig"), @@ -265,6 +266,7 @@ ("swin", "SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("swinv2", "SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("t5", "T5_PRETRAINED_CONFIG_ARCHIVE_MAP"), + 
("table-transformer", "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -417,6 +419,7 @@ ("swinv2", "Swin Transformer V2"), ("t5", "T5"), ("t5v1.1", "T5v1.1"), + ("table-transformer", "Table Transformer"), ("tapas", "TAPAS"), ("tapex", "TAPEX"), ("time_series_transformer", "Time Series Transformer"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 586498ee81c5a..4c4d27a31be6b 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -69,6 +69,7 @@ ("speech_to_text", "Speech2TextFeatureExtractor"), ("swin", "ViTFeatureExtractor"), ("swinv2", "ViTFeatureExtractor"), + ("table-transformer", "TableTransformerFeatureExtractor"), ("van", "ConvNextFeatureExtractor"), ("videomae", "VideoMAEFeatureExtractor"), ("vilt", "ViltFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 6713fd2cdf05b..4531840081b58 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -130,6 +130,7 @@ ("swin", "SwinModel"), ("swinv2", "Swinv2Model"), ("t5", "T5Model"), + ("table-transformer", "TableTransformerModel"), ("tapas", "TapasModel"), ("time_series_transformer", "TimeSeriesTransformerModel"), ("trajectory_transformer", "TrajectoryTransformerModel"), @@ -384,6 +385,7 @@ # Do not add new models here, this class will be deprecated in the future. # Model for Image Segmentation mapping ("detr", "DetrForSegmentation"), + ("table-transformer", "TableTransformerForSegmentation"), ] ) @@ -468,6 +470,7 @@ ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), ("detr", "DetrForObjectDetection"), + ("table-transformer", "TableTransformerForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] ) diff --git a/src/transformers/models/table_transformer/__init__.py b/src/transformers/models/table_transformer/__init__.py new file mode 100644 index 0000000000000..279e6d3cde7ba --- /dev/null +++ b/src/transformers/models/table_transformer/__init__.py @@ -0,0 +1,69 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_timm_available + + +_import_structure = { + "configuration_table_transformer": [ + "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "TableTransformerConfig", + "TableTransformerOnnxConfig", + ] +} + +try: + if not is_timm_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_table_transformer"] = [ + "TABLE_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "TableTransformerForObjectDetection", + "TableTransformerModel", + "TableTransformerPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_table_transformer import ( + TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + TableTransformerConfig, + TableTransformerOnnxConfig, + ) + + try: + if not is_timm_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_table_transformer import ( + TABLE_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + TableTransformerForObjectDetection, + TableTransformerModel, + TableTransformerPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py new file mode 100644 index 0000000000000..c058946221b4d --- /dev/null +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -0,0 +1,239 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TABLE_TRANSFORMER model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "microsoft/table-transformer-table-detection": ( + "https://huggingface.co/microsoft/table-transformer-table-detection/resolve/main/config.json" + ), +} + + +class TableTransformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`TableTransformerModel`]. It is used to + instantiate a TABLE_TRANSFORMER model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the TABLE_TRANSFORMER + [microsoft/table-transformer-table-detection](https://huggingface.co/microsoft/table-transformer-table-detection) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        num_queries (`int`, *optional*, defaults to 100):
+            Number of object queries, i.e. detection slots. This is the maximal number of objects
+            [`TableTransformerModel`] can detect in a single image. For COCO, we recommend 100 queries.
+        d_model (`int`, *optional*, defaults to 256):
+            Dimension of the layers.
+        encoder_layers (`int`, *optional*, defaults to 6):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 6):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in the decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in the encoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        init_xavier_std (`float`, *optional*, defaults to 1):
+            The scaling factor used for the Xavier initialization gain in the HM Attention map module.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        auxiliary_loss (`bool`, *optional*, defaults to `False`):
+            Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+        position_embedding_type (`str`, *optional*, defaults to `"sine"`):
+            Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
+        backbone (`str`, *optional*, defaults to `"resnet50"`):
+            Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a
+            list of all available models, see [this
+            page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model).
+        use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
+            Whether to use pretrained weights for the backbone.
+        dilation (`bool`, *optional*, defaults to `False`):
+            Whether to replace stride with dilation in the last convolutional block (DC5).
+        class_cost (`float`, *optional*, defaults to 1):
+            Relative weight of the classification error in the Hungarian matching cost.
+ bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + + Examples: + + ```python + >>> from transformers import TableTransformerModel, TableTransformerConfig + + >>> # Initializing a TABLE_TRANSFORMER microsoft/table-transformer-table-detection style configuration + >>> configuration = TableTransformerConfig() + + >>> # Initializing a model from the microsoft/table-transformer-table-detection style configuration + >>> model = TableTransformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "table-transformer" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + num_channels=3, + num_queries=100, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + classifier_dropout=0.0, + scale_embedding=False, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + use_pretrained_backbone=True, + dilation=False, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + **kwargs + ): + self.num_channels = num_channels + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.use_pretrained_backbone 
= use_pretrained_backbone + self.dilation = dilation + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + +class TableTransformerOnnxConfig(OnnxConfig): + + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("pixel_mask", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-5 + + @property + def default_onnx_opset(self) -> int: + return 12 diff --git a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000..2ce56fe626fb7 --- /dev/null +++ b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,286 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert TABLE_TRANSFORMER checkpoints.""" + + +import argparse +import json +from collections import OrderedDict +from pathlib import Path + +import torch +from PIL import Image + +import requests +from huggingface_hub import hf_hub_download +from transformers import ( + DetrFeatureExtractor, + TableTransformerConfig, + TableTransformerForObjectDetection, + TableTransformerForSegmentation, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +# here we list all keys to be renamed (original name on the left, our name on the right) +rename_keys = [] +for i in range(6): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) + # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", + f"decoder.layers.{i}.encoder_attn.out_proj.weight", + ) + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", + f"decoder.layers.{i}.encoder_attn.out_proj.bias", + ) + ) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") + ) + 
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) + +# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads +rename_keys.extend( + [ + ("input_proj.weight", "input_projection.weight"), + ("input_proj.bias", "input_projection.bias"), + ("query_embed.weight", "query_position_embeddings.weight"), + ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), + ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), + ("class_embed.weight", "class_labels_classifier.weight"), + ("class_embed.bias", "class_labels_classifier.bias"), + ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), + ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), + ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), + ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), + ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), + ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), + ] +) + + +def rename_key(state_dict, old, new): + val = state_dict.pop(old) + state_dict[new] = val + + +def rename_backbone_keys(state_dict): + new_state_dict = OrderedDict() + for key, value in state_dict.items(): + if "backbone.0.body" in key: + new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") + new_state_dict[new_key] = value + else: + new_state_dict[key] = value + + return new_state_dict + + +def read_in_q_k_v(state_dict, is_panoptic=False): + prefix = "" + if is_panoptic: + prefix = "table_transformer." + + # first: transformer encoder + for i in range(6): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + # next: transformer decoder (which is a bit more complex because it also includes cross-attention) + for i in range(6): + # read in weights + bias of input projection layer of self-attention + in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + 
state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + # read in weights + bias of input projection layer of cross-attention + in_proj_weight_cross_attn = state_dict.pop( + f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" + ) + in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") + # next, add query, keys and values (in that order) of cross-attention to the state dict + state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] + state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] + state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] + state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] + state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] + state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_table_transformer_checkpoint(model_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our TABLE_TRANSFORMER structure. + """ + + # load default config + config = TableTransformerConfig() + # set backbone and dilation attributes + if "resnet101" in model_name: + config.backbone = "resnet101" + if "dc5" in model_name: + config.dilation = True + is_panoptic = "panoptic" in model_name + if is_panoptic: + config.num_labels = 250 + else: + config.num_labels = 91 + repo_id = "huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load feature extractor + format = "coco_panoptic" if is_panoptic else "coco_detection" + feature_extractor = DetrFeatureExtractor(format=format) + + # prepare image + img = prepare_img() + encoding = feature_extractor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info(f"Converting model {model_name}...") + + # load original model from torch hub + table_transformer = torch.hub.load("facebookresearch/table_transformer", model_name, pretrained=True).eval() + state_dict = table_transformer.state_dict() + # rename keys + for src, dest in rename_keys: + if is_panoptic: + src = "table_transformer." + src + rename_key(state_dict, src, dest) + state_dict = rename_backbone_keys(state_dict) + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, is_panoptic=is_panoptic) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "table_transformer.model." if is_panoptic else "model." 
+ for key in state_dict.copy().keys(): + if is_panoptic: + if ( + key.startswith("table_transformer") + and not key.startswith("class_labels_classifier") + and not key.startswith("bbox_predictor") + ): + val = state_dict.pop(key) + state_dict["table_transformer.model" + key[4:]] = val + elif "class_labels_classifier" in key or "bbox_predictor" in key: + val = state_dict.pop(key) + state_dict["table_transformer." + key] = val + elif key.startswith("bbox_attention") or key.startswith("mask_head"): + continue + else: + val = state_dict.pop(key) + state_dict[prefix + key] = val + else: + if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = TableTransformerForSegmentation(config) if is_panoptic else TableTransformerForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + # verify our conversion + original_outputs = table_transformer(pixel_values) + outputs = model(pixel_values) + assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) + assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) + if is_panoptic: + assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) + + # Save model and feature extractor + logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name", + default="table_transformer_resnet50", + type=str, + help="Name of the TABLE_TRANSFORMER model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + args = parser.parse_args() + convert_table_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py new file mode 100644 index 0000000000000..285b87a80ac8c --- /dev/null +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -0,0 +1,1945 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI Research The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch TABLE_TRANSFORMER model.""" + + +import math +import random +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import torch_int_div +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + logging, + replace_return_docstrings, + requires_backends, +) +from .configuration_table_transformer import TableTransformerConfig + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_timm_available(): + from timm import create_model + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "TableTransformerConfig" +_CHECKPOINT_FOR_DOC = "microsoft/table-transformer-table-detection" + +TABLE_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/table-transformer-table-detection", + # See all Table Transformer models at https://huggingface.co/models?filter=table-transformer +] + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrDecoderOutput with DETR->TABLE_TRANSFORMER,Detr->TableTransformer +class TableTransformerDecoderOutput(BaseModelOutputWithCrossAttentions): + """ + Base class for outputs of the TABLE_TRANSFORMER decoder. This class adds one attribute to + BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output + of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary + decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. 
+ """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrModelOutput with DETR->TABLE_TRANSFORMER,Detr->TableTransformer +class TableTransformerModelOutput(Seq2SeqModelOutput): + """ + Base class for outputs of the TABLE_TRANSFORMER encoder-decoder model. This class adds one attribute to + Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder + layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding + losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. 
+ """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->TableTransformer,DetrFeatureExtractor->DetrFeatureExtractor +class TableTransformerObjectDetectionOutput(ModelOutput): + """ + Output type of [`TableTransformerForObjectDetection`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~TableTransformerFeatureExtractor.post_process_object_detection`] to + retrieve the unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. 
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+            layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[Dict] = None
+    logits: torch.FloatTensor = None
+    pred_boxes: torch.FloatTensor = None
+    auxiliary_outputs: Optional[List[Dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->TableTransformer
+class TableTransformerFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.ops.misc with added eps before rsqrt, without which any model other than
+    torchvision.models.resnet[18,34,50,101] produces NaNs.
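+
+    A minimal sketch of the fixed affine transform this module applies (the channel count is illustrative):
+
+    ```python
+    import torch
+
+    bn = TableTransformerFrozenBatchNorm2d(4)
+    x = torch.randn(2, 4, 8, 8)
+    # y = (x - running_mean) / sqrt(running_var + eps) * weight + bias, with all four buffers frozen
+    y = bn(x)
+    assert y.shape == x.shape
+    ```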
+ """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->TableTransformer +def replace_batch_norm(m, name=""): + for attr_str in dir(m): + target_attr = getattr(m, attr_str) + if isinstance(target_attr, nn.BatchNorm2d): + frozen = TableTransformerFrozenBatchNorm2d(target_attr.num_features) + bn = getattr(m, attr_str) + frozen.weight.data.copy_(bn.weight) + frozen.bias.data.copy_(bn.bias) + frozen.running_mean.data.copy_(bn.running_mean) + frozen.running_var.data.copy_(bn.running_var) + setattr(m, attr_str, frozen) + for n, ch in m.named_children(): + replace_batch_norm(ch, n) + + +# Copied from transformers.models.detr.modeling_detr.DetrTimmConvEncoder with Detr->TableTransformer +class TableTransformerTimmConvEncoder(nn.Module): + """ + Convolutional encoder (backbone) from the timm library. + + nn.BatchNorm2d layers are replaced by TableTransformerFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, name: str, dilation: bool, use_pretrained_backbone: bool, num_channels: int = 3): + super().__init__() + + kwargs = {} + if dilation: + kwargs["output_stride"] = 16 + + requires_backends(self, ["timm"]) + + backbone = create_model( + name, + pretrained=use_pretrained_backbone, + features_only=True, + out_indices=(1, 2, 3, 4), + in_chans=num_channels, + **kwargs, + ) + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = self.model.feature_info.channels() + + if "resnet" in name: + for name, parameter in self.model.named_parameters(): + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->TableTransformer +class TableTransformerConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. 
+ """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): + """ + Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. + """ + batch_size, source_len = mask.size() + target_len = target_len if target_len is not None else source_len + + expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->TableTransformer +class TableTransformerSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2) / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->TableTransformer +class TableTransformerLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. 
+ """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->TableTransformer +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = TableTransformerSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = TableTransformerLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->TABLE_TRANSFORMER,Detr->TableTransformer +class TableTransformerAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the TABLE_TRANSFORMER paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + key_value_states: Optional[torch.Tensor] = None, + key_value_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size, target_len, embed_dim = hidden_states.size() + + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # add key-value position embeddings to the key value states + if key_value_position_embeddings is not None: + key_value_states_original = key_value_states + key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, batch_size) + value_states = self._shape(self.v_proj(key_value_states_original), -1, batch_size) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
# In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
+            attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size * self.num_heads, target_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrEncoderLayer with Detr->TableTransformer
+class TableTransformerEncoderLayer(nn.Module):
+    def __init__(self, config: TableTransformerConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = TableTransformerAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_embeddings: torch.Tensor = None,
+        output_attentions: bool = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                values.
+            position_embeddings (`torch.FloatTensor`, *optional*): position embeddings, to be added to hidden_states.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
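+
+        A minimal sketch of a single layer call (config values and shapes are illustrative):
+
+        ```python
+        import torch
+
+        from transformers import TableTransformerConfig
+
+        config = TableTransformerConfig(d_model=256, encoder_attention_heads=8)
+        layer = TableTransformerEncoderLayer(config)
+        hidden_states = torch.randn(2, 49, 256)  # a flattened 7x7 feature map, batch first
+        position_embeddings = torch.randn(2, 49, 256)
+        (hidden_states,) = layer(hidden_states, attention_mask=None, position_embeddings=position_embeddings)
+        ```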
+ """ + residual = hidden_states + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.detr.modeling_detr.DetrDecoderLayer with Detr->TableTransformer +class TableTransformerDecoderLayer(nn.Module): + def __init__(self, config: TableTransformerConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = TableTransformerAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = TableTransformerAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the cross-attention layer. + query_position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the self-attention layer. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=query_position_embeddings, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + position_embeddings=query_position_embeddings, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + key_value_position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead with Detr->TableTransformer +class TableTransformerClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class TableTransformerPreTrainedModel(PreTrainedModel): + config_class = TableTransformerConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, TableTransformerLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def 
_set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, TableTransformerDecoder):
+            module.gradient_checkpointing = value
+
+
+TABLE_TRANSFORMER_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`TableTransformerConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TABLE_TRANSFORMER_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it.
+
+            Pixel values can be obtained using [`DetrFeatureExtractor`]. See [`DetrFeatureExtractor.__call__`] for
+            details.
+
+        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
+
+            - 1 for pixels that are real (i.e. **not masked**),
+            - 0 for pixels that are padding (i.e. **masked**).
+
+            [What are attention masks?](../glossary#attention-mask)
+
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*):
+            Not used by default. Can be used to mask object queries.
+        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->TABLE_TRANSFORMER,Detr->TableTransformer
+class TableTransformerEncoder(TableTransformerPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
+    [`TableTransformerEncoderLayer`].
+ + The encoder updates the flattened feature map through multiple self-attention layers. + + Small tweak for TABLE_TRANSFORMER: + + - position_embeddings are added to the forward pass. + + Args: + config: TableTransformerConfig + """ + + def __init__(self, config: TableTransformerConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + self.layers = nn.ModuleList([TableTransformerEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # in the original TABLE_TRANSFORMER, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
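+
+        A minimal sketch of calling the encoder on a flattened feature map (config values and shapes are
+        illustrative):
+
+        ```python
+        import torch
+
+        from transformers import TableTransformerConfig
+
+        config = TableTransformerConfig(d_model=256, encoder_layers=2, encoder_attention_heads=8)
+        encoder = TableTransformerEncoder(config)
+        inputs_embeds = torch.randn(1, 49, 256)  # a flattened 7x7 feature map
+        position_embeddings = torch.randn(1, 49, 256)
+        outputs = encoder(inputs_embeds=inputs_embeds, position_embeddings=position_embeddings)
+        assert outputs.last_hidden_state.shape == (1, 49, 256)
+        ```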
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + # we add position_embeddings as extra input to the encoder_layer + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +# Copied from transformers.models.detr.modeling_detr.DetrDecoder with DETR->TABLE_TRANSFORMER,Detr->TableTransformer +class TableTransformerDecoder(TableTransformerPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TableTransformerDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some small tweaks for TABLE_TRANSFORMER: + + - position_embeddings and query_position_embeddings are added to the forward pass. + - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. + + Args: + config: TableTransformerConfig + """ + + def __init__(self, config: TableTransformerConfig): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + + self.layers = nn.ModuleList([TableTransformerDecoderLayer(config) for _ in range(config.decoder_layers)]) + # in TABLE_TRANSFORMER, the decoder uses layernorm after the last decoder layer output + self.layernorm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + query_position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The query embeddings that are passed into the decoder. 
+
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`:
+
+                - 1 for queries that are **not masked**,
+                - 0 for queries that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+
+            position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Position embeddings that are added to the queries and keys in each cross-attention layer.
+            query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+            input_shape = inputs_embeds.size()[:-1]
+
+        combined_attention_mask = None
+
+        if attention_mask is not None and combined_attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            combined_attention_mask = combined_attention_mask + _expand_mask(
+                attention_mask, inputs_embeds.dtype, target_len=input_shape[-1]
+            )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            encoder_attention_mask = _expand_mask(
+                encoder_attention_mask, inputs_embeds.dtype, target_len=input_shape[-1]
+            )
+
+        # optional intermediate hidden states
+        intermediate = () if self.config.auxiliary_loss else None
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+
+            if self.gradient_checkpointing and self.training:
+                # wrap the layer call so that `torch.utils.checkpoint` can re-run it during the backward pass;
+                # `output_attentions` is captured by the closure rather than passed as a tensor input
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs, output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    combined_attention_mask,
+                    position_embeddings,
+                    query_position_embeddings,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=combined_attention_mask,
+                    position_embeddings=position_embeddings,
+                    query_position_embeddings=query_position_embeddings,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    output_attentions=output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if self.config.auxiliary_loss:
+                hidden_states = self.layernorm(hidden_states)
+                intermediate += (hidden_states,)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # finally, apply layernorm
+        hidden_states = self.layernorm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # stack intermediate decoder activations
+        if self.config.auxiliary_loss:
+            intermediate = torch.stack(intermediate)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate]
+                if v is not None
+            )
+        return TableTransformerDecoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+            intermediate_hidden_states=intermediate,
+        )
+
+
+@add_start_docstrings(
+    """
+    The bare TABLE_TRANSFORMER Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
+    hidden-states without any specific head on top.
+ """, + TABLE_TRANSFORMER_START_DOCSTRING, +) +# Copied from transformers.models.detr.modeling_detr.DetrModel with DETR->TABLE_TRANSFORMER,Detr->TableTransformer,facebook/detr-resnet-50->microsoft/table-transformer-table-detection +class TableTransformerModel(TableTransformerPreTrainedModel): + def __init__(self, config: TableTransformerConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = TableTransformerTimmConvEncoder( + config.backbone, config.dilation, config.use_pretrained_backbone, config.num_channels + ) + position_embeddings = build_position_encoding(config) + self.backbone = TableTransformerConvModel(backbone, position_embeddings) + + # Create projection layer + self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) + + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = TableTransformerEncoder(config) + self.decoder = TableTransformerDecoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + @add_start_docstrings_to_model_forward(TABLE_TRANSFORMER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TableTransformerModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import TableTransformerFeatureExtractor, TableTransformerModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = TableTransformerFeatureExtractor.from_pretrained( + ... "microsoft/table-transformer-table-detection" + ... 
)
+        >>> model = TableTransformerModel.from_pretrained("microsoft/table-transformer-table-detection")
+
+        >>> # prepare image for the model
+        >>> inputs = feature_extractor(images=image, return_tensors="pt")
+
+        >>> # forward pass
+        >>> outputs = model(**inputs)
+
+        >>> # the last hidden states are the final query embeddings of the Transformer decoder
+        >>> # these are of shape (batch_size, num_queries, hidden_size)
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 100, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones(((batch_size, height, width)), device=device)
+
+        # First, send pixel_values + pixel_mask through Backbone to obtain the features
+        # pixel_values should be of shape (batch_size, num_channels, height, width)
+        # pixel_mask should be of shape (batch_size, height, width)
+        features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)
+
+        # get final feature map and downsampled mask
+        feature_map, mask = features[-1]
+
+        if mask is None:
+            raise ValueError("Backbone does not return downsampled pixel mask")
+
+        # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        projected_feature_map = self.input_projection(feature_map)
+
+        # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
+        # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
+        flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
+        position_embeddings = position_embeddings_list[-1].flatten(2).permute(0, 2, 1)
+
+        flattened_mask = mask.flatten(1)
+
+        # Fourth, send flattened_features + flattened_mask + position embeddings through encoder
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                inputs_embeds=flattened_features,
+                attention_mask=flattened_mask,
+                position_embeddings=position_embeddings,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Fifth, send query embeddings + position embeddings through the decoder (which is conditioned on the encoder output)
+        query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1)
+        queries = torch.zeros_like(query_position_embeddings)
+
+        # decoder outputs consists of (dec_features, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            inputs_embeds=queries,
+            attention_mask=None,
+            position_embeddings=position_embeddings,
+            query_position_embeddings=query_position_embeddings,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=flattened_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return TableTransformerModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+        )
+
+
+@add_start_docstrings(
+    """
+    TABLE_TRANSFORMER Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on
+    top, for tasks such as table detection and table structure recognition.
+    """,
+    TABLE_TRANSFORMER_START_DOCSTRING,
+)
+# Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection with DETR->TABLE_TRANSFORMER,Detr->TableTransformer,detr->table_transformer,facebook/detr-resnet-50->microsoft/table-transformer-table-detection
+class TableTransformerForObjectDetection(TableTransformerPreTrainedModel):
+    def __init__(self, config: TableTransformerConfig):
+        super().__init__(config)
+
+        # TABLE_TRANSFORMER encoder-decoder model
+        self.model = TableTransformerModel(config)
+
+        # Object detection heads
+        self.class_labels_classifier = nn.Linear(
+            config.d_model, config.num_labels + 1
+        )  # We add one for the "no object" class
+        self.bbox_predictor = TableTransformerMLPPredictionHead(
+            input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
+
+    @add_start_docstrings_to_model_forward(TABLE_TRANSFORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TableTransformerObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values,
+        pixel_mask=None,
+        decoder_attention_mask=None,
+        encoder_outputs=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`List[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
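+
+            For instance, a batch with one image annotated with a single box could pass (an illustrative sketch; the
+            class id and the (center_x, center_y, width, height) coordinates, normalized to [0, 1], are made up):
+
+            ```python
+            labels = [{"class_labels": torch.tensor([0]), "boxes": torch.tensor([[0.5, 0.5, 0.4, 0.2]])}]
+            ```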
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import TableTransformerFeatureExtractor, TableTransformerForObjectDetection
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> feature_extractor = TableTransformerFeatureExtractor.from_pretrained(
+        ...     "microsoft/table-transformer-table-detection"
+        ... )
+        >>> model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-table-detection")
+
+        >>> inputs = feature_extractor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> # convert outputs (bounding boxes and class logits) to COCO API
+        >>> target_sizes = torch.tensor([image.size[::-1]])
+        >>> results = feature_extractor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]
+
+        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+        ...     box = [round(i, 2) for i in box.tolist()]
+        ...     # let's only keep detections with score > 0.9
+        ...     if score > 0.9:
+        ...         print(
+        ...             f"Detected {model.config.id2label[label.item()]} with confidence "
+        ...             f"{round(score.item(), 3)} at location {box}"
+        ...         )
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # First, send images through TABLE_TRANSFORMER base model to obtain encoder + decoder outputs
+        outputs = self.model(
+            pixel_values,
+            pixel_mask=pixel_mask,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        # class logits + predicted bounding boxes
+        logits = self.class_labels_classifier(sequence_output)
+        pred_boxes = self.bbox_predictor(sequence_output).sigmoid()
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            # First: create the matcher
+            matcher = TableTransformerHungarianMatcher(
+                class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost
+            )
+            # Second: create the criterion
+            losses = ["labels", "boxes", "cardinality"]
+            criterion = TableTransformerLoss(
+                matcher=matcher,
+                num_classes=self.config.num_labels,
+                eos_coef=self.config.eos_coefficient,
+                losses=losses,
+            )
+            criterion.to(self.device)
+            # Third: compute the losses, based on outputs and labels
+            outputs_loss = {}
+            outputs_loss["logits"] = logits
+            outputs_loss["pred_boxes"] = pred_boxes
+            if self.config.auxiliary_loss:
+                intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4]
+                outputs_class = self.class_labels_classifier(intermediate)
+                outputs_coord = self.bbox_predictor(intermediate).sigmoid()
+                auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord)
+                outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+
+            loss_dict = criterion(outputs_loss, labels)
+            # Fourth: compute total loss, as a weighted 
sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return TableTransformerObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +# Copied from transformers.models.detr.modeling_detr.dice_loss +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# Copied from transformers.models.detr.modeling_detr.DetrLoss with Detr->TableTransformer,detr->table_transformer +class TableTransformerLoss(nn.Module): + """ + This class computes the losses for TableTransformerForObjectDetection/TableTransformerForSegmentation. 
The process + happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) + we supervise each pair of matched ground-truth / prediction (supervise class and box). + + A note on the `num_classes` argument (copied from original repo in table_transformer.py): "the naming of the + `num_classes` parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where + `max_obj_id` is the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass + `num_classes` to be 91. As another example, for a dataset that has a single class with `id` 1, you should pass + `num_classes` to be 2 (`max_obj_id` + 1). For more details on this, check the following discussion + https://github.com/facebookresearch/table_transformer/issues/108#issuecomment-650269223" + + + Args: + matcher (`TableTransformerHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + eos_coef (`float`): + Relative classification weight applied to the no-object category. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, eos_coef, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer("empty_weight", empty_weight) + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim + [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + src_logits = outputs["logits"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_o + + loss_ce = nn.functional.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. + """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. 
The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(src_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the masks: the focal loss and the dice loss. + + Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. + """ + if "pred_masks" not in outputs: + raise KeyError("No predicted masks found in outputs") + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = nn.functional.interpolate( + src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False + ) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + "masks": self.loss_masks, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the + losses applied, see each loss' doc. 
+ """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes across all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->TableTransformer,detr->table_transformer +class TableTransformerMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/table_transformer/blob/master/models/table_transformer.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# Copied from transformers.models.detr.modeling_detr.DetrHungarianMatcher with Detr->TableTransformer +class TableTransformerHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. + giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. 
+ """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. + + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. + class_cost = -out_prob[:, target_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# Copied from transformers.models.detr.modeling_detr._upcast +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +# Copied from transformers.models.detr.modeling_detr.box_area +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. + + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. 
They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# Copied from transformers.models.detr.modeling_detr.box_iou +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. + + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") + iou, union = box_iou(boxes1, boxes2) + + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] + + return iou - (area - union) / area + + +# Copied from transformers.models.detr.modeling_detr._max_by_axis +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +# Copied from transformers.models.detr.modeling_detr.NestedTensor +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + batch_size, num_channels, height, width = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) + + +def center_to_corners_format(x): + """ + Converts a PyTorch tensor of bounding boxes of center 
format (center_x, center_y, width, height) to corners format + (x_0, y_0, x_1, y_1). + """ + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) diff --git a/src/transformers/utils/dummy_timm_and_vision_objects.py b/src/transformers/utils/dummy_timm_and_vision_objects.py index 0696a17d3778a..b4be05ece1af2 100644 --- a/src/transformers/utils/dummy_timm_and_vision_objects.py +++ b/src/transformers/utils/dummy_timm_and_vision_objects.py @@ -87,3 +87,27 @@ class DetrPreTrainedModel(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["timm", "vision"]) + + +TABLE_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TableTransformerForObjectDetection(metaclass=DummyObject): + _backends = ["timm", "vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) + + +class TableTransformerModel(metaclass=DummyObject): + _backends = ["timm", "vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) + + +class TableTransformerPreTrainedModel(metaclass=DummyObject): + _backends = ["timm", "vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["timm", "vision"]) diff --git a/tests/models/table_transformer/__init__.py b/tests/models/table_transformer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py new file mode 100644 index 0000000000000..287045eacd489 --- /dev/null +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -0,0 +1,524 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch TABLE_TRANSFORMER model. 
""" + + +import inspect +import math +import unittest + +from transformers import TableTransformerConfig, is_timm_available, is_vision_available +from transformers.testing_utils import require_timm, require_vision, slow, torch_device +from transformers.utils import cached_property + +from ...generation.test_generation_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor + + +if is_timm_available(): + import torch + + from transformers import TableTransformerForObjectDetection, TableTransformerModel + + +if is_vision_available(): + from PIL import Image + + from transformers import DetrFeatureExtractor + + +class TableTransformerModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=256, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + min_size=200, + max_size=200, + n_targets=8, + num_labels=91, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.min_size = min_size + self.max_size = max_size + self.n_targets = n_targets + self.num_labels = num_labels + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) + + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + return TableTransformerConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def 
create_and_check_table_transformer_model(self, config, pixel_values, pixel_mask, labels): + model = TableTransformerModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) + ) + + def create_and_check_table_transformer_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = TableTransformerForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_timm +class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + TableTransformerModel, + TableTransformerForObjectDetection, + ) + if is_timm_available() + else () + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ in ["TableTransformerForObjectDetection"]: + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = TableTransformerModelTester(self) + self.config_tester = ConfigTester(self, config_class=TableTransformerConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_table_transformer_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_table_transformer_model(*config_and_inputs) + + def test_table_transformer_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_table_transformer_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="TABLE_TRANSFORMER does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="TABLE_TRANSFORMER does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="TABLE_TRANSFORMER is not a generative model") + def test_generate_without_input_ids(self): + pass + + 
@unittest.skip(reason="TABLE_TRANSFORMER does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @slow + def test_model_outputs_equivalence(self): + # TODO Niels: fix me! + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + decoder_seq_length = self.model_tester.decoder_seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + decoder_key_length = self.model_tester.decoder_seq_length + encoder_key_length = self.model_tester.encoder_seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "TableTransformerForObjectDetection": + correct_outlen += 2 + + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = 
outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+
+            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(self_attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
+
+    def test_retain_grad_hidden_states_attentions(self):
+        # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.output_hidden_states = True
+        config.output_attentions = True
+
+        # no need to test all models as different heads yield the same functionality
+        model_class = self.all_model_classes[0]
+        model = model_class(config)
+        model.to(torch_device)
+
+        inputs = self._prepare_for_class(inputs_dict, model_class)
+
+        outputs = model(**inputs)
+
+        output = outputs[0]
+
+        encoder_hidden_states = outputs.encoder_hidden_states[0]
+        encoder_attentions = outputs.encoder_attentions[0]
+        encoder_hidden_states.retain_grad()
+        encoder_attentions.retain_grad()
+
+        decoder_attentions = outputs.decoder_attentions[0]
+        decoder_attentions.retain_grad()
+
+        cross_attentions = outputs.cross_attentions[0]
+        cross_attentions.retain_grad()
+
+        output.flatten()[0].backward(retain_graph=True)
+
+        self.assertIsNotNone(encoder_hidden_states.grad)
+        self.assertIsNotNone(encoder_attentions.grad)
+        self.assertIsNotNone(decoder_attentions.grad)
+        self.assertIsNotNone(cross_attentions.grad)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            if model.config.is_encoder_decoder:
+                expected_arg_names = ["pixel_values", "pixel_mask"]
+                expected_arg_names.extend(
+                    ["head_mask", "decoder_head_mask", "encoder_outputs"]
+                    if "head_mask" in arg_names and "decoder_head_mask" in arg_names
+                    else []
+                )
+                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+            else:
+                expected_arg_names = ["pixel_values", "pixel_mask"]
+                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+    def test_different_timm_backbone(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # let's pick a random timm backbone
+        config.backbone = "tf_mobilenetv3_small_075"
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            if model_class.__name__ == "TableTransformerForObjectDetection":
+                expected_shape = (
+                    self.model_tester.batch_size,
+                    self.model_tester.num_queries,
+                    self.model_tester.num_labels + 1,
+                )
+                self.assertEqual(outputs.logits.shape, expected_shape)
+
+            self.assertTrue(outputs)
+
+    def test_greyscale_images(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # use greyscale pixel values
+        inputs_dict["pixel_values"] = floats_tensor(
+            [self.model_tester.batch_size, 1, self.model_tester.min_size, self.model_tester.max_size]
+        )
+
+        # let's set num_channels to 1
+        config.num_channels = 1
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs =
model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + configs_no_init.init_xavier_std = 1e9 + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "bbox_attention" in name and "bias" not in name: + self.assertLess( + 100000, + abs(param.data.max().item()), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class TableTransformerModelIntegrationTests(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + DetrFeatureExtractor.from_pretrained("microsoft/table-transformer-table-detection") + if is_vision_available() + else None + ) + + def test_inference_no_head(self): + model = TableTransformerModel.from_pretrained("microsoft/table-transformer-table-detection").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape = torch.Size((1, 100, 256)) + assert outputs.last_hidden_state.shape == expected_shape + expected_slice = torch.tensor( + [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + def test_inference_object_detection_head(self): + model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-table-detection").to( + torch_device + ) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) diff --git a/utils/check_repo.py b/utils/check_repo.py index 988eb499aa302..6e89d97b9c874 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -46,6 +46,8 @@ # Being in this 
list is an exception and should **not** be the rule. IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ # models to ignore for not tested + "TableTransformerEncoder", # Building part of bigger (tested) model. + "TableTransformerDecoder", # Building part of bigger (tested) model. "TimeSeriesTransformerEncoder", # Building part of bigger (tested) model. "TimeSeriesTransformerDecoder", # Building part of bigger (tested) model. "DeformableDetrEncoder", # Building part of bigger (tested) model. From e9c40e72e24c5deef2c6adffb0be76cad8509aeb Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 14 Oct 2022 08:39:38 +0000 Subject: [PATCH 02/14] Add conversion script --- .../models/auto/feature_extraction_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 1 - .../configuration_table_transformer.py | 11 +- ..._original_pytorch_checkpoint_to_pytorch.py | 217 ++++++++++-------- .../modeling_table_transformer.py | 2 +- 5 files changed, 133 insertions(+), 100 deletions(-) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 4c4d27a31be6b..18f19c1dd9b19 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -69,7 +69,7 @@ ("speech_to_text", "Speech2TextFeatureExtractor"), ("swin", "ViTFeatureExtractor"), ("swinv2", "ViTFeatureExtractor"), - ("table-transformer", "TableTransformerFeatureExtractor"), + ("table-transformer", "DetrFeatureExtractor"), ("van", "ConvNextFeatureExtractor"), ("videomae", "VideoMAEFeatureExtractor"), ("vilt", "ViltFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4531840081b58..3da1dc1790572 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -385,7 +385,6 @@ # Do not add new models here, this class will be deprecated in the future. # Model for Image Segmentation mapping ("detr", "DetrForSegmentation"), - ("table-transformer", "TableTransformerForSegmentation"), ] ) diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index c058946221b4d..5e4e4790609c2 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# Copyright The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TABLE_TRANSFORMER model configuration""" +""" Table Transformer model configuration""" from collections import OrderedDict from typing import Mapping @@ -36,8 +36,8 @@ class TableTransformerConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`TableTransformerModel`]. It is used to - instantiate a TABLE_TRANSFORMER model according to the specified arguments, defining the model architecture. 
- Instantiating a configuration with the defaults will yield a similar configuration to that of the TABLE_TRANSFORMER + instantiate a Table Transformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Table Transformer [microsoft/table-transformer-table-detection](https://huggingface.co/microsoft/table-transformer-table-detection) architecture. @@ -117,7 +117,7 @@ class TableTransformerConfig(PretrainedConfig): ```python >>> from transformers import TableTransformerModel, TableTransformerConfig - >>> # Initializing a TABLE_TRANSFORMER microsoft/table-transformer-table-detection style configuration + >>> # Initializing a Table Transformer microsoft/table-transformer-table-detection style configuration >>> configuration = TableTransformerConfig() >>> # Initializing a model from the microsoft/table-transformer-table-detection style configuration @@ -133,6 +133,7 @@ class TableTransformerConfig(PretrainedConfig): "num_attention_heads": "encoder_attention_heads", } + # Copied from transformers.models.detr.configuration_detr.DetrConfig.__init__ def __init__( self, num_channels=3, diff --git a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py index 2ce56fe626fb7..67525fd817d31 100644 --- a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py @@ -12,25 +12,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert TABLE_TRANSFORMER checkpoints.""" +"""Convert Table Transformer checkpoints. 
+ +URL: https://github.com/microsoft/table-transformer +""" import argparse -import json from collections import OrderedDict from pathlib import Path import torch from PIL import Image +from torchvision.transforms import functional as F -import requests from huggingface_hub import hf_hub_download -from transformers import ( - DetrFeatureExtractor, - TableTransformerConfig, - TableTransformerForObjectDetection, - TableTransformerForSegmentation, -) +from transformers import DetrConfig, DetrFeatureExtractor, DetrForObjectDetection from transformers.utils import logging @@ -93,12 +90,14 @@ rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) -# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads +# convolutional projection + query embeddings + layernorm of encoder + layernorm of decoder + class and bounding box heads rename_keys.extend( [ ("input_proj.weight", "input_projection.weight"), ("input_proj.bias", "input_projection.bias"), ("query_embed.weight", "query_position_embeddings.weight"), + ("transformer.encoder.norm.weight", "encoder.layernorm.weight"), + ("transformer.encoder.norm.bias", "encoder.layernorm.bias"), ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), ("class_embed.weight", "class_labels_classifier.weight"), @@ -130,10 +129,8 @@ def rename_backbone_keys(state_dict): return new_state_dict -def read_in_q_k_v(state_dict, is_panoptic=False): +def read_in_q_k_v(state_dict): prefix = "" - if is_panoptic: - prefix = "table_transformer." # first: transformer encoder for i in range(6): @@ -173,114 +170,150 @@ def read_in_q_k_v(state_dict, is_panoptic=False): state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) +def resize(image, checkpoint_url): + width, height = image.size + current_max_size = max(width, height) + target_max_size = 800 if "detection" in checkpoint_url else 1000 + scale = target_max_size / current_max_size + resized_image = image.resize((int(round(scale * width)), int(round(scale * height)))) + + return resized_image + - return im +def normalize(image): + image = F.to_tensor(image) + image = F.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + return image @torch.no_grad() -def convert_table_transformer_checkpoint(model_name, pytorch_dump_folder_path): +def convert_table_transformer_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub): """ - Copy/paste/tweak model's weights to our TABLE_TRANSFORMER structure. + Copy/paste/tweak model's weights to our DETR structure. 
""" - # load default config - config = TableTransformerConfig() - # set backbone and dilation attributes - if "resnet101" in model_name: - config.backbone = "resnet101" - if "dc5" in model_name: - config.dilation = True - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load feature extractor - format = "coco_panoptic" if is_panoptic else "coco_detection" - feature_extractor = DetrFeatureExtractor(format=format) + logger.info("Converting model...") - # prepare image - img = prepare_img() - encoding = feature_extractor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info(f"Converting model {model_name}...") - - # load original model from torch hub - table_transformer = torch.hub.load("facebookresearch/table_transformer", model_name, pretrained=True).eval() - state_dict = table_transformer.state_dict() + # load original state dict + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") # rename keys for src, dest in rename_keys: - if is_panoptic: - src = "table_transformer." + src rename_key(state_dict, src, dest) state_dict = rename_backbone_keys(state_dict) # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) + read_in_q_k_v(state_dict) # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "table_transformer.model." if is_panoptic else "model." + prefix = "model." for key in state_dict.copy().keys(): - if is_panoptic: - if ( - key.startswith("table_transformer") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["table_transformer.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["table_transformer." 
+ key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = TableTransformerForSegmentation(config) if is_panoptic else TableTransformerForObjectDetection(config) + if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # create HuggingFace model and load state dict + config = DetrConfig( + backbone="resnet18", + normalize_before=True, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + ce_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.4, + class_cost=1, + bbox_cost=5, + giou_cost=2, + ) + + if "detection" in checkpoint_url: + config.num_queries = 15 + config.num_labels = 2 + id2label = {0: "table", 1: "table rotated"} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + else: + config.num_queries = 125 + config.num_labels = 6 + id2label = { + 0: "table", + 1: "table column", + 2: "table row", + 3: "table column header", + 4: "table projected row header", + 5: "table spanning cell", + } + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + feature_extractor = DetrFeatureExtractor( + format="coco_detection", max_size=800 if "detection" in checkpoint_url else 1000 + ) + model = DetrForObjectDetection(config) model.load_state_dict(state_dict) model.eval() + # verify our conversion - original_outputs = table_transformer(pixel_values) + filename = "example_pdf.png" if "detection" in checkpoint_url else "example_table.png" + file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename=filename) + image = Image.open(file_path).convert("RGB") + pixel_values = normalize(resize(image, checkpoint_url)).unsqueeze(0) + outputs = model(pixel_values) - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - # Save model and feature extractor - logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - feature_extractor.save_pretrained(pytorch_dump_folder_path) + if "detection" in checkpoint_url: + expected_shape = (1, 15, 3) + expected_logits = torch.tensor( + [[-6.7897, -16.9985, 6.7937], [-8.0186, -22.2192, 6.9677], [-7.3117, -21.0708, 7.4055]] + ) + expected_boxes = torch.tensor([[0.4867, 0.1767, 0.6732], [0.6718, 0.4479, 0.3830], [0.4716, 0.1760, 0.6364]]) + + else: + expected_shape = (1, 125, 7) + expected_logits = torch.tensor( + [[-18.1430, -8.3214, 4.8274], [-18.4685, -7.1361, -4.2667], [-26.3693, -9.3429, -4.9962]] + ) + expected_boxes = torch.tensor([[0.4983, 0.5595, 0.9440], [0.4916, 0.6315, 0.5954], [0.6108, 0.8637, 0.1135]]) + + assert outputs.logits.shape == expected_shape + assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4) + assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) + print("Looks ok!") + + if 
pytorch_dump_folder_path is not None: + # Save model and feature extractor + logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + # Push model to HF hub + logger.info("Pushing model to the hub...") + model_name = ( + "nielsr/detr-table-detection" + if "detection" in checkpoint_url + else "nielsr/detr-table-structure-recognition" + ) + model.push_to_hub(model_name) + feature_extractor.push_to_hub(model_name) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--model_name", - default="table_transformer_resnet50", + "--checkpoint_url", + default="https://pubtables1m.blob.core.windows.net/model/pubtables1m_detection_detr_r18.pth", type=str, - help="Name of the TABLE_TRANSFORMER model you'd like to convert.", + choices=[ + "https://pubtables1m.blob.core.windows.net/model/pubtables1m_detection_detr_r18.pth", + "https://pubtables1m.blob.core.windows.net/model/pubtables1m_structure_detr_r18.pth", + ], + help="URL of the Table Transformer checkpoint you'd like to convert.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) args = parser.parse_args() - convert_table_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path) + convert_table_transformer_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 285b87a80ac8c..90d1823086f16 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Facebook AI Research The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
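[Editor's note: the conversion script added in the commit above can also be driven programmatically rather than through `argparse`. The following is a minimal sketch, not part of the patch: it assumes the module path created by this commit, uses one of the two official PubTables-1M checkpoint URLs listed in the script's `choices`, and the output directory name is hypothetical.]

```python
# Sketch only: invoke the conversion entry point added in this commit.
# The checkpoint URL is one of the two official PubTables-1M checkpoints;
# "./table-transformer-detection" is a hypothetical local output directory.
from transformers.models.table_transformer.convert_table_transformer_original_pytorch_checkpoint_to_pytorch import (
    convert_table_transformer_checkpoint,
)

convert_table_transformer_checkpoint(
    checkpoint_url="https://pubtables1m.blob.core.windows.net/model/pubtables1m_detection_detr_r18.pth",
    pytorch_dump_folder_path="./table-transformer-detection",
    push_to_hub=False,
)
```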
From 1efeded2067d78d6befd638827e615252988e683 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 14 Oct 2022 08:55:18 +0000 Subject: [PATCH 03/14] Make conversion work --- ..._original_pytorch_checkpoint_to_pytorch.py | 7 ++-- .../modeling_table_transformer.py | 34 +++++++++++-------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py index 67525fd817d31..5991498c08331 100644 --- a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py @@ -27,7 +27,7 @@ from torchvision.transforms import functional as F from huggingface_hub import hf_hub_download -from transformers import DetrConfig, DetrFeatureExtractor, DetrForObjectDetection +from transformers import DetrFeatureExtractor, TableTransformerConfig, TableTransformerForObjectDetection from transformers.utils import logging @@ -209,9 +209,8 @@ def convert_table_transformer_checkpoint(checkpoint_url, pytorch_dump_folder_pat val = state_dict.pop(key) state_dict[prefix + key] = val # create HuggingFace model and load state dict - config = DetrConfig( + config = TableTransformerConfig( backbone="resnet18", - normalize_before=True, mask_loss_coefficient=1, dice_loss_coefficient=1, ce_loss_coefficient=1, @@ -246,7 +245,7 @@ def convert_table_transformer_checkpoint(checkpoint_url, pytorch_dump_folder_pat feature_extractor = DetrFeatureExtractor( format="coco_detection", max_size=800 if "detection" in checkpoint_url else 1000 ) - model = DetrForObjectDetection(config) + model = TableTransformerForObjectDetection(config) model.load_state_dict(state_dict) model.eval() diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 90d1823086f16..706ecd13b52c7 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch TABLE_TRANSFORMER model.""" +""" PyTorch Table Transformer model.""" import math @@ -544,8 +544,8 @@ def forward( return attn_output, attn_weights_reshaped -# Copied from transformers.models.detr.modeling_detr.DetrEncoderLayer with Detr->TableTransformer class TableTransformerEncoderLayer(nn.Module): + # Copied from transformers.models.detr.modeling_detr.DetrEncoderLayer.__init__ with Detr->TableTransformer def __init__(self, config: TableTransformerConfig): super().__init__() self.embed_dim = config.d_model @@ -581,6 +581,8 @@ def forward( returned tensors for more detail. 
""" residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, @@ -590,9 +592,10 @@ def forward( hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) @@ -600,7 +603,6 @@ def forward( hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) if self.training: if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): @@ -615,8 +617,8 @@ def forward( return outputs -# Copied from transformers.models.detr.modeling_detr.DetrDecoderLayer with Detr->TableTransformer class TableTransformerDecoderLayer(nn.Module): + # Copied from transformers.models.detr.modeling_detr.DetrDecoderLayer.__init__ with Detr->TableTransformer def __init__(self, config: TableTransformerConfig): super().__init__() self.embed_dim = config.d_model @@ -675,6 +677,7 @@ def forward( returned tensors for more detail. """ residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) # Self Attention hidden_states, self_attn_weights = self.self_attn( @@ -686,13 +689,13 @@ def forward( hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) # Cross-Attention Block cross_attn_weights = None if encoder_hidden_states is not None: - residual = hidden_states - hidden_states, cross_attn_weights = self.encoder_attn( hidden_states=hidden_states, position_embeddings=query_position_embeddings, @@ -704,16 +707,16 @@ def forward( hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - hidden_states = self.encoder_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) # Fully Connected - residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) outputs = (hidden_states,) @@ -824,7 +827,6 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.detr.modeling_detr.DetrEncoder with DETR->TABLE_TRANSFORMER,Detr->TableTransformer class TableTransformerEncoder(TableTransformerPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -832,7 +834,7 @@ class TableTransformerEncoder(TableTransformerPreTrainedModel): The encoder updates the flattened feature map through multiple self-attention layers. 
- Small tweak for TABLE_TRANSFORMER: + Small tweak for Table Transformer: - position_embeddings are added to the forward pass. @@ -848,7 +850,7 @@ def __init__(self, config: TableTransformerConfig): self.layers = nn.ModuleList([TableTransformerEncoderLayer(config) for _ in range(config.encoder_layers)]) - # in the original TABLE_TRANSFORMER, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + self.layernorm = nn.LayerNorm(config.d_model) # Initialize weights and apply final processing self.post_init() @@ -903,7 +905,7 @@ def forward( encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None - for i, encoder_layer in enumerate(self.layers): + for encoder_layer in self.layers: if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -927,6 +929,8 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) + hidden_states = self.layernorm(hidden_states) + if not return_dict: return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( From ae81123a7a331d48f7ed10952c87465ea8d5e6ee Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 14 Oct 2022 09:25:21 +0000 Subject: [PATCH 04/14] Upload checkpoints --- .../source/en/model_doc/table-transformer.mdx | 3 +- ..._original_pytorch_checkpoint_to_pytorch.py | 4 +- .../modeling_table_transformer.py | 1 + .../test_modeling_table_transformer.py | 82 +++++++------------ 4 files changed, 33 insertions(+), 57 deletions(-) diff --git a/docs/source/en/model_doc/table-transformer.mdx b/docs/source/en/model_doc/table-transformer.mdx index 0ed63f462b12f..4f627e4d00518 100644 --- a/docs/source/en/model_doc/table-transformer.mdx +++ b/docs/source/en/model_doc/table-transformer.mdx @@ -32,7 +32,8 @@ special customization for these tasks.* Tips: - The authors released 2 models, one for table detection in documents, one for table structure recognition (the task of recognizing the individual rows, columns etc. in a table). -- Usage of these models is identical to other object detectors in the library, such as [`DetrForObjectDetection`]. +- One can use the [`AutoFeatureExtractor`] API to prepare images and optional targets for the model. +- Usage of Table Transformer models is identical to other object detectors in the library, such as [`DetrForObjectDetection`]. Refer to [these notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR). This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/table-transformer). 
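Editor's note, before the remaining file diffs of this patch: the modeling change in PATCH 03 above moves each layer norm in front of its sublayer, turning DETR's post-norm residual blocks into Table Transformer's pre-norm blocks, with an extra `nn.LayerNorm` applied after the encoder stack (added above). A simplified sketch of the two residual schemes, not the actual module code:

```python
import torch
from torch import nn


# DETR (post-norm):             x = layer_norm(x + sublayer(x))
# Table Transformer (pre-norm): x = x + sublayer(layer_norm(x))
def pre_norm_block(x: torch.Tensor, sublayer: nn.Module, layer_norm: nn.LayerNorm) -> torch.Tensor:
    residual = x
    x = layer_norm(x)  # normalization happens *before* the sublayer
    x = sublayer(x)
    return residual + x  # the residual branch stays un-normalized, hence the final encoder LayerNorm
```

This is also why the `# Copied from ...Detr*Layer` comments are narrowed to the `__init__` methods above: the forward passes no longer match DETR line for line.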
diff --git a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py index 5991498c08331..a53bd9e03d80d 100644 --- a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py @@ -287,9 +287,9 @@ def convert_table_transformer_checkpoint(checkpoint_url, pytorch_dump_folder_pat # Push model to HF hub logger.info("Pushing model to the hub...") model_name = ( - "nielsr/detr-table-detection" + "microsoft/table-transformer-detection" if "detection" in checkpoint_url - else "nielsr/detr-table-structure-recognition" + else "microsoft/table-transformer-structure-recognition" ) model.push_to_hub(model_name) feature_extractor.push_to_hub(model_name) diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 706ecd13b52c7..999370d6ae8e1 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -1939,6 +1939,7 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): return NestedTensor(tensor, mask) +# Copied from transformers.models.detr.modeling_detr.center_to_corners_format def center_to_corners_format(x): """ Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 287045eacd489..ea79b8cc1e8e8 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -12,16 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch TABLE_TRANSFORMER model. """ +""" Testing suite for the PyTorch Table Transformer model. 
""" import inspect import math import unittest +from huggingface_hub import hf_hub_download from transformers import TableTransformerConfig, is_timm_available, is_vision_available from transformers.testing_utils import require_timm, require_vision, slow, torch_device -from transformers.utils import cached_property from ...generation.test_generation_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -37,7 +37,7 @@ if is_vision_available(): from PIL import Image - from transformers import DetrFeatureExtractor + from transformers import AutoFeatureExtractor class TableTransformerModelTester: @@ -212,19 +212,19 @@ def test_table_transformer_object_detection_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_table_transformer_object_detection_head_model(*config_and_inputs) - @unittest.skip(reason="TABLE_TRANSFORMER does not use inputs_embeds") + @unittest.skip(reason="Table Transformer does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip(reason="TABLE_TRANSFORMER does not have a get_input_embeddings method") + @unittest.skip(reason="Table Transformer does not have a get_input_embeddings method") def test_model_common_attributes(self): pass - @unittest.skip(reason="TABLE_TRANSFORMER is not a generative model") + @unittest.skip(reason="Table Transformer is not a generative model") def test_generate_without_input_ids(self): pass - @unittest.skip(reason="TABLE_TRANSFORMER does not use token embeddings") + @unittest.skip(reason="Table Transformer does not use token embeddings") def test_resize_tokens_embeddings(self): pass @@ -470,55 +470,29 @@ def prepare_img(): @require_vision @slow class TableTransformerModelIntegrationTests(unittest.TestCase): - @cached_property - def default_feature_extractor(self): - return ( - DetrFeatureExtractor.from_pretrained("microsoft/table-transformer-table-detection") - if is_vision_available() - else None - ) - - def test_inference_no_head(self): - model = TableTransformerModel.from_pretrained("microsoft/table-transformer-table-detection").to(torch_device) + def test_table_detection(self): + feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection") + model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection") + model.to(torch_device) - feature_extractor = self.default_feature_extractor - image = prepare_img() - encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png") + image = Image.open(file_path).convert("RGB") + inputs = feature_extractor(image, return_tensors="pt").to(torch_device) + # forward pass with torch.no_grad(): - outputs = model(**encoding) - - expected_shape = torch.Size((1, 100, 256)) - assert outputs.last_hidden_state.shape == expected_shape - expected_slice = torch.tensor( - [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] - ).to(torch_device) - self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) - - def test_inference_object_detection_head(self): - model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-table-detection").to( - torch_device - ) + outputs = model(**inputs) - feature_extractor = self.default_feature_extractor - image = prepare_img() - encoding = feature_extractor(images=image, 
return_tensors="pt").to(torch_device) - pixel_values = encoding["pixel_values"].to(torch_device) - pixel_mask = encoding["pixel_mask"].to(torch_device) + expected_shape = (1, 15, 3) + self.assertEqual(outputs.logits.shape, expected_shape) - with torch.no_grad(): - outputs = model(pixel_values, pixel_mask) - - expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1)) - self.assertEqual(outputs.logits.shape, expected_shape_logits) - expected_slice_logits = torch.tensor( - [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]] - ).to(torch_device) - self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) - - expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - expected_slice_boxes = torch.tensor( - [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]] - ).to(torch_device) - self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + expected_logits = torch.tensor( + [[-6.7329, -16.9590, 6.7447], [-8.0038, -22.3071, 6.9288], [-7.2445, -20.9855, 7.3465]], + device=torch_device, + ) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + + expected_boxes = torch.tensor( + [[0.4868, 0.1764, 0.6729], [0.6674, 0.4621, 0.3864], [0.4720, 0.1757, 0.6362]], device=torch_device + ) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-3)) From 89b47ef8a82f1d72d3704aa7e405ea446b9d45da Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 14 Oct 2022 09:47:49 +0000 Subject: [PATCH 05/14] Add final fixes --- .../source/en/model_doc/table-transformer.mdx | 2 +- .../modeling_table_transformer.py | 52 ++++++++----------- utils/documentation_tests.txt | 1 + 3 files changed, 24 insertions(+), 31 deletions(-) diff --git a/docs/source/en/model_doc/table-transformer.mdx b/docs/source/en/model_doc/table-transformer.mdx index 4f627e4d00518..dabad00d20609 100644 --- a/docs/source/en/model_doc/table-transformer.mdx +++ b/docs/source/en/model_doc/table-transformer.mdx @@ -31,7 +31,7 @@ special customization for these tasks.* Tips: -- The authors released 2 models, one for table detection in documents, one for table structure recognition (the task of recognizing the individual rows, columns etc. in a table). +- The authors released 2 models, one for [table detection](https://huggingface.co/microsoft/table-transformer-detection) in documents, one for [table structure recognition](https://huggingface.co/microsoft/table-transformer-structure-recognition) (the task of recognizing the individual rows, columns etc. in a table). - One can use the [`AutoFeatureExtractor`] API to prepare images and optional targets for the model. - Usage of Table Transformer models is identical to other object detectors in the library, such as [`DetrForObjectDetection`]. Refer to [these notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR). 
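Editor's note: to make the usage tips above concrete, a short end-to-end sketch combining the pieces that appear in this patch series (the example image is the one the new integration test downloads; `post_process_object_detection` with a `threshold` argument is the call used in the docstring example a later patch adds):

```python
import torch
from huggingface_hub import hf_hub_download
from PIL import Image

from transformers import AutoFeatureExtractor, TableTransformerForObjectDetection

# example document image used by the integration test above
file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png")
image = Image.open(file_path).convert("RGB")

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection")
model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")

with torch.no_grad():
    outputs = model(**feature_extractor(images=image, return_tensors="pt"))

# rescale the normalized (center_x, center_y, width, height) predictions to
# absolute (x0, y0, x1, y1) boxes in the original image
target_sizes = torch.tensor([image.size[::-1]])
results = feature_extractor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(v, 2) for v in box.tolist()])
```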
diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 999370d6ae8e1..38f181afbccc3 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -49,10 +49,10 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "TableTransformerConfig" -_CHECKPOINT_FOR_DOC = "microsoft/table-transformer-table-detection" +_CHECKPOINT_FOR_DOC = "microsoft/table-transformer-detection" TABLE_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "microsoft/table-transformer-table-detection", + "microsoft/table-transformer-detection", # See all Table Transformer models at https://huggingface.co/models?filter=table-transformer ] @@ -1122,13 +1122,13 @@ def custom_forward(*inputs): @add_start_docstrings( """ - The bare TABLE_TRANSFORMER Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + The bare Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without any specific head on top. """, TABLE_TRANSFORMER_START_DOCSTRING, ) -# Copied from transformers.models.detr.modeling_detr.DetrModel with DETR->TABLE_TRANSFORMER,Detr->TableTransformer,facebook/detr-resnet-50->microsoft/table-transformer-table-detection class TableTransformerModel(TableTransformerPreTrainedModel): + # Copied from transformers.models.detr.modeling_detr.DetrModel.__init__ with Detr->TableTransformer def __init__(self, config: TableTransformerConfig): super().__init__(config) @@ -1184,17 +1184,15 @@ def forward( Examples: ```python - >>> from transformers import TableTransformerFeatureExtractor, TableTransformerModel + >>> from transformers import AutoFeatureExtractor, TableTransformerModel + >>> from huggingface_hub import hf_hub_download >>> from PIL import Image - >>> import requests - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + >>> file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png") + >>> image = Image.open(file_path).convert("RGB") - >>> feature_extractor = TableTransformerFeatureExtractor.from_pretrained( - ... "microsoft/table-transformer-table-detection" - ... ) - >>> model = TableTransformerModel.from_pretrained("microsoft/table-transformer-table-detection") + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection") + >>> model = TableTransformerModel.from_pretrained("microsoft/table-transformer-detection") >>> # prepare image for the model >>> inputs = feature_extractor(images=image, return_tensors="pt") @@ -1206,7 +1204,7 @@ def forward( >>> # these are of shape (batch_size, num_queries, hidden_size) >>> last_hidden_states = outputs.last_hidden_state >>> list(last_hidden_states.shape) - [1, 100, 256] + [1, 15, 256] ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1295,17 +1293,17 @@ def forward( @add_start_docstrings( """ - TABLE_TRANSFORMER Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks such as COCO detection. 
""", TABLE_TRANSFORMER_START_DOCSTRING, ) -# Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection with DETR->TABLE_TRANSFORMER,Detr->TableTransformer,detr->table_transformer,facebook/detr-resnet-50->microsoft/table-transformer-table-detection class TableTransformerForObjectDetection(TableTransformerPreTrainedModel): + # Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection.__init__ with Detr->TableTransformer def __init__(self, config: TableTransformerConfig): super().__init__(config) - # TABLE_TRANSFORMER encoder-decoder model + # DETR encoder-decoder model self.model = TableTransformerModel(config) # Object detection heads @@ -1319,8 +1317,8 @@ def __init__(self, config: TableTransformerConfig): # Initialize weights and apply final processing self.post_init() - # taken from https://github.com/facebookresearch/table_transformer/blob/master/models/table_transformer.py @torch.jit.unused + # Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection._set_aux_loss def _set_aux_loss(self, outputs_class, outputs_coord): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such @@ -1354,18 +1352,16 @@ def forward( Examples: ```python - >>> from transformers import TableTransformerFeatureExtractor, TableTransformerForObjectDetection + >>> from huggingface_hub import hf_hub_download + >>> from transformers import AutoFeatureExtractor, TableTransformerForObjectDetection >>> import torch >>> from PIL import Image - >>> import requests - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + >>> file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png") + >>> image = Image.open(file_path).convert("RGB") - >>> feature_extractor = TableTransformerFeatureExtractor.from_pretrained( - ... "facebook/table_transformer-resnet-50" - ... ) - >>> model = TableTransformerForObjectDetection.from_pretrained("facebook/table_transformer-resnet-50") + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection") + >>> model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) @@ -1382,11 +1378,7 @@ def forward( ... f"Detected {model.config.id2label[label.item()]} with confidence " ... f"{round(score.item(), 3)} at location {box}" ... 
) - Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98] - Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66] - Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76] - Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93] - Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72] + Detected table with confidence 1.0 at location [202.1, 210.59, 1119.22, 385.09] ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index e1a33d70cafa3..a36b3f6de8c6e 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -115,6 +115,7 @@ src/transformers/models/segformer/modeling_tf_segformer.py src/transformers/models/swin/configuration_swin.py src/transformers/models/swin/modeling_swin.py src/transformers/models/swinv2/configuration_swinv2.py +src/transformers/models/table_transformer/modeling_table_transformer.py src/transformers/models/time_series_transformer/configuration_time_series_transformer.py src/transformers/models/time_series_transformer/modeling_time_series_transformer.py src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py From 2a77f95c1ae88111e460e379fcce500c5e2fa87f Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Mon, 17 Oct 2022 11:08:24 +0000 Subject: [PATCH 06/14] Revert changes of conditional and deformable detr --- docs/source/en/model_doc/deformable_detr.mdx | 2 +- .../modeling_conditional_detr.py | 30 ++++++++++--------- src/transformers/models/detr/modeling_detr.py | 4 +++ 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/docs/source/en/model_doc/deformable_detr.mdx b/docs/source/en/model_doc/deformable_detr.mdx index 84d213bd4ea6c..08cbbb0aa0218 100644 --- a/docs/source/en/model_doc/deformable_detr.mdx +++ b/docs/source/en/model_doc/deformable_detr.mdx @@ -23,7 +23,7 @@ The abstract from the paper is the following: Tips: -- One can use [`DeformableDetrFeatureExtractor`] to prepare images (and optional targets) for the model. +- One can use the [`AutoFeatureExtractor`] API to prepare images (and optional targets) for the model. This will instantiate a [`DetrFeatureExtractor`] behind the scenes. - Training Deformable DETR is equivalent to training the original [DETR](detr) model. Demo notebooks can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR). 
Tensor: # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type if t.is_floating_point(): @@ -2611,7 +2614,6 @@ def center_to_corners_format(x): return torch.stack(b, dim=-1) -# Copied from transformers.models.detr.modeling_detr._max_by_axis def _max_by_axis(the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index fbe92538aee8f..50252329257c3 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -33,6 +33,7 @@ add_start_docstrings_to_model_forward, is_scipy_available, is_timm_available, + is_vision_available, logging, replace_return_docstrings, requires_backends, @@ -43,6 +44,9 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment +if is_vision_available(): + from .feature_extraction_detr import center_to_corners_format + if is_timm_available(): from timm import create_model From c7a33178a14ef6a8b751bfd02bce9862a30a271e Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Mon, 17 Oct 2022 11:14:18 +0000 Subject: [PATCH 07/14] Fix toctree, add and remove copied from --- docs/source/en/_toctree.yml | 2 +- .../models/table_transformer/configuration_table_transformer.py | 1 + .../models/table_transformer/modeling_table_transformer.py | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d9607c07b42c0..59d355f0a5b5b 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -412,7 +412,7 @@ title: Swin Transformer - local: model_doc/swinv2 title: Swin Transformer V2 - - local: model_doc/table_transformer + - local: model_doc/table-transformer title: Table Transformer - local: model_doc/van title: VAN diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 5e4e4790609c2..0d35b335ab066 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -218,6 +218,7 @@ def hidden_size(self) -> int: return self.d_model +# Copied from transformers.models.detr.configuration_detr.DetrOnnxConfig class TableTransformerOnnxConfig(OnnxConfig): torch_onnx_minimum_version = version.parse("1.11") diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 38f181afbccc3..ff04489fb9d62 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -1931,7 +1931,6 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): return NestedTensor(tensor, mask) -# Copied from transformers.models.detr.modeling_detr.center_to_corners_format def center_to_corners_format(x): """ Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format From ef74a88546564ab2011b23c8457b623fa36536dd Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Mon, 17 Oct 2022 11:18:24 +0000 Subject: [PATCH 08/14] Use model type --- README.md | 2 +- README_es.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.mdx | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff 
--git a/README.md b/README.md index 5f0173c6cc640..c291d5a461dd0 100644 --- a/README.md +++ b/README.md @@ -375,7 +375,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table_transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_es.md b/README_es.md index 8c64b19a2ca5e..8e6ad7d902a37 100644 --- a/README_es.md +++ b/README_es.md @@ -375,7 +375,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. 
**[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table_transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_ko.md b/README_ko.md index 14bad0088fb5f..4f0c005148a01 100644 --- a/README_ko.md +++ b/README_ko.md @@ -325,7 +325,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. 
**[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table_transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_zh-hans.md b/README_zh-hans.md index 279e4853bd819..129ab5d7ae874 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -349,7 +349,7 @@ conda install -c huggingface transformers 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 -1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table_transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。 +1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table-transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。 1. 
**[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_zh-hant.md b/README_zh-hant.md index 3cc42138c2674..5d7292b03cc02 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -361,7 +361,7 @@ conda install -c huggingface transformers 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table_transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 164d6afab4aaa..04f82259d7ce0 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -164,7 +164,7 @@ The documentation is organized into five sections: 1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[Table Transformer](model_doc/table_transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[Table Transformer](model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](model_doc/time_series_transformer)** (from HuggingFace). From e6c8b16da11950d59ad49aad2bf7db11bd27ea20 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Mon, 17 Oct 2022 11:26:06 +0000 Subject: [PATCH 09/14] Improve docs --- docs/source/en/model_doc/table-transformer.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/table-transformer.mdx b/docs/source/en/model_doc/table-transformer.mdx index dabad00d20609..2665e2b462b73 100644 --- a/docs/source/en/model_doc/table-transformer.mdx +++ b/docs/source/en/model_doc/table-transformer.mdx @@ -32,8 +32,8 @@ special customization for these tasks.* Tips: - The authors released 2 models, one for [table detection](https://huggingface.co/microsoft/table-transformer-detection) in documents, one for [table structure recognition](https://huggingface.co/microsoft/table-transformer-structure-recognition) (the task of recognizing the individual rows, columns etc. in a table). -- One can use the [`AutoFeatureExtractor`] API to prepare images and optional targets for the model. 
-- Usage of Table Transformer models is identical to other object detectors in the library, such as [`DetrForObjectDetection`]. Refer to [these notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR). +- One can use the [`AutoFeatureExtractor`] API to prepare images and optional targets for the model. This will load a [`DetrFeatureExtractor`] behind the scenes. +- A demo notebook for the Table Transformer can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Table%20Transformer). This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/table-transformer). From c73afd608ed4692565b282237ac29df2f46cd2b3 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Mon, 17 Oct 2022 12:39:09 +0000 Subject: [PATCH 10/14] Improve code example --- .../modeling_table_transformer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index ff04489fb9d62..daadfdc65bb92 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -1368,16 +1368,16 @@ def forward( >>> # convert outputs (bounding boxes and class logits) to COCO API >>> target_sizes = torch.tensor([image.size[::-1]]) - >>> results = feature_extractor.post_process_object_detection(outputs, target_sizes=target_sizes)[0] + >>> results = feature_extractor.post_process_object_detection( ... outputs, threshold=0.9, target_sizes=target_sizes ... )[0] >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): ... box = [round(i, 2) for i in box.tolist()] - ... # let's only keep detections with score > 0.9 - ... if score > 0.9: - ... print( - ... f"Detected {model.config.id2label[label.item()]} with confidence " - ... f"{round(score.item(), 3)} at location {box}" - ... ) + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ...
) Detected table with confidence 1.0 at location [202.1, 210.59, 1119.22, 385.09] ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict From 496f9eb021da2918eaa5308e36f00b13317ed127 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 18 Oct 2022 10:11:30 +0000 Subject: [PATCH 11/14] Update copies --- .../modeling_conditional_detr.py | 4 -- src/transformers/models/detr/modeling_detr.py | 4 -- .../modeling_table_transformer.py | 58 +++++++++---------- 3 files changed, 29 insertions(+), 37 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 7067c0917a987..e2017135776b6 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -33,7 +33,6 @@ add_start_docstrings_to_model_forward, is_scipy_available, is_timm_available, - is_vision_available, logging, replace_return_docstrings, requires_backends, @@ -44,9 +43,6 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment -if is_vision_available(): - from .feature_extraction_conditional_detr import center_to_corners_format - if is_timm_available(): from timm import create_model diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 50252329257c3..fbe92538aee8f 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -33,7 +33,6 @@ add_start_docstrings_to_model_forward, is_scipy_available, is_timm_available, - is_vision_available, logging, replace_return_docstrings, requires_backends, @@ -44,9 +43,6 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment -if is_vision_available(): - from .feature_extraction_detr import center_to_corners_format - if is_timm_available(): from timm import create_model diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index daadfdc65bb92..ee96071661647 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -1557,16 +1557,16 @@ def loss_labels(self, outputs, targets, indices, num_boxes): """ if "logits" not in outputs: raise KeyError("No logits were found in the outputs") - src_logits = outputs["logits"] + source_logits = outputs["logits"] - idx = self._get_src_permutation_idx(indices) + idx = self._get_source_permutation_idx(indices) target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) target_classes = torch.full( - src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device ) target_classes[idx] = target_classes_o - loss_ce = nn.functional.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight) losses = {"loss_ce": loss_ce} return losses @@ -1596,17 +1596,17 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): """ if "pred_boxes" not in outputs: raise KeyError("No predicted boxes found in outputs") - idx = self._get_src_permutation_idx(indices) - src_boxes = outputs["pred_boxes"][idx] + idx = 
self._get_source_permutation_idx(indices) + source_boxes = outputs["pred_boxes"][idx] target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - loss_bbox = nn.functional.l1_loss(src_boxes, target_boxes, reduction="none") + loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") losses = {} losses["loss_bbox"] = loss_bbox.sum() / num_boxes loss_giou = 1 - torch.diag( - generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) + generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) ) losses["loss_giou"] = loss_giou.sum() / num_boxes return losses @@ -1620,41 +1620,41 @@ def loss_masks(self, outputs, targets, indices, num_boxes): if "pred_masks" not in outputs: raise KeyError("No predicted masks found in outputs") - src_idx = self._get_src_permutation_idx(indices) - tgt_idx = self._get_tgt_permutation_idx(indices) - src_masks = outputs["pred_masks"] - src_masks = src_masks[src_idx] + source_idx = self._get_source_permutation_idx(indices) + target_idx = self._get_target_permutation_idx(indices) + source_masks = outputs["pred_masks"] + source_masks = source_masks[source_idx] masks = [t["masks"] for t in targets] # TODO use valid to mask invalid areas due to padding in loss target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() - target_masks = target_masks.to(src_masks) - target_masks = target_masks[tgt_idx] + target_masks = target_masks.to(source_masks) + target_masks = target_masks[target_idx] # upsample predictions to the target size - src_masks = nn.functional.interpolate( - src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False + source_masks = nn.functional.interpolate( + source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False ) - src_masks = src_masks[:, 0].flatten(1) + source_masks = source_masks[:, 0].flatten(1) target_masks = target_masks.flatten(1) - target_masks = target_masks.view(src_masks.shape) + target_masks = target_masks.view(source_masks.shape) losses = { - "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), - "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes), + "loss_dice": dice_loss(source_masks, target_masks, num_boxes), } return losses - def _get_src_permutation_idx(self, indices): + def _get_source_permutation_idx(self, indices): # permute predictions following indices - batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) - src_idx = torch.cat([src for (src, _) in indices]) - return batch_idx, src_idx + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) + source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx - def _get_tgt_permutation_idx(self, indices): + def _get_target_permutation_idx(self, indices): # permute targets following indices - batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) - tgt_idx = torch.cat([tgt for (_, tgt) in indices]) - return batch_idx, tgt_idx + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx def get_loss(self, loss, outputs, targets, indices, num_boxes): loss_map = { @@ -1675,7 +1675,7 @@ def forward(self, outputs, targets): outputs (`dict`, 
*optional*): Dictionary of tensors, see the output specification of the model for the format. targets (`List[dict]`, *optional*): - List of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the losses applied, see each loss' doc. """ outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} From 8a7080e96b3be6763331a9e41e1a5a51cd2a7c60 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 18 Oct 2022 10:14:28 +0000 Subject: [PATCH 12/14] Add copied from --- .../models/table_transformer/modeling_table_transformer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index ee96071661647..f3d61bb969151 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -1931,11 +1931,12 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): return NestedTensor(tensor, mask) +# Copied from transformers.models.detr.modeling_detr.center_to_corners_format def center_to_corners_format(x): """ Converts a PyTorch tensor of bounding boxes of center format (center_x, center_y, width, height) to corners format (x_0, y_0, x_1, y_1). """ - x_c, y_c, w, h = x.unbind(-1) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] + center_x, center_y, width, height = x.unbind(-1) + b = [(center_x - 0.5 * width), (center_y - 0.5 * height), (center_x + 0.5 * width), (center_y + 0.5 * height)] return torch.stack(b, dim=-1) From 749d564d38c12c139781f32d4ef1e86cb1b9e8db Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 18 Oct 2022 10:24:37 +0000 Subject: [PATCH 13/14] Don't update conditional detr --- .../modeling_conditional_detr.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index e2017135776b6..a5ba056f5d3a0 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2170,6 +2170,7 @@ def forward(self, q, k, mask: Optional[Tensor] = None): return weights +# Copied from transformers.models.detr.modeling_detr.dice_loss def dice_loss(inputs, targets, num_boxes): """ Compute the DICE loss, similar to generalized IOU for masks @@ -2189,26 +2190,28 @@ def dice_loss(inputs, targets, num_boxes): return loss.sum() / num_boxes +# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): """ Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). - alpha: (optional) Weighting factor in range (0,1) to balance - positive vs negative examples. Default = -1 (no weighting). - gamma: Exponent of the modulating factor (1 - p_t) to - balance easy vs hard examples.
+ inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. Returns: Loss tensor """ prob = inputs.sigmoid() ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor p_t = prob * targets + (1 - prob) * (1 - targets) loss = ce_loss * ((1 - p_t) ** gamma) @@ -2529,9 +2532,7 @@ def forward(self, outputs, targets): return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] -# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py - - +# Copied from transformers.models.detr.modeling_detr._upcast def _upcast(t: Tensor) -> Tensor: # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type if t.is_floating_point(): @@ -2610,6 +2611,7 @@ def center_to_corners_format(x): return torch.stack(b, dim=-1) +# Copied from transformers.models.detr.modeling_detr._max_by_axis def _max_by_axis(the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] From 8023ce59c7738a09f95ca1c1e65ce992c069ea95 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 18 Oct 2022 10:27:12 +0000 Subject: [PATCH 14/14] Don't update deformable detr --- docs/source/en/model_doc/deformable_detr.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/deformable_detr.mdx b/docs/source/en/model_doc/deformable_detr.mdx index 08cbbb0aa0218..84d213bd4ea6c 100644 --- a/docs/source/en/model_doc/deformable_detr.mdx +++ b/docs/source/en/model_doc/deformable_detr.mdx @@ -23,7 +23,7 @@ The abstract from the paper is the following: Tips: -- One can use the [`AutoFeatureExtractor`] API to prepare images (and optional targets) for the model. This will instantiate a [`DetrFeatureExtractor`] behind the scenes. +- One can use [`DeformableDetrFeatureExtractor`] to prepare images (and optional targets) for the model. - Training Deformable DETR is equivalent to training the original [DETR](detr) model. Demo notebooks can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR).
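Editor's note: since the last two patches only restore DETR-copied utilities, a standalone sketch of the `sigmoid_focal_loss` whose docstring is reworked above may be useful. The gamma-modulation steps mirror the lines shown in the diffs; the alpha-balancing step is the standard RetinaNet formulation, and the shapes below are illustrative assumptions:

```python
import torch
from torch import nn


def sigmoid_focal_loss_sketch(inputs, targets, num_boxes, alpha=0.25, gamma=2):
    prob = inputs.sigmoid()
    ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    # the modulating factor (1 - p_t) ** gamma down-weights well-classified examples
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)
    if alpha >= 0:  # standard alpha-balancing of positive vs. negative examples
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss
    return loss.mean(1).sum() / num_boxes


# illustrative shapes: (batch, num_queries, num_classes)
logits = torch.randn(2, 10, 3)
labels = torch.randint(0, 2, (2, 10, 3)).float()
print(sigmoid_focal_loss_sketch(logits, labels, num_boxes=4))
```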