From be3bc75ed39effea9f1a3cb4f39810ca369c9c47 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 15 Jun 2022 11:24:47 -0700
Subject: [PATCH] normalize keys_to_ignore

---
 src/transformers/models/bart/modeling_bart.py  |  4 ++--
 .../modeling_bigbird_pegasus.py                |  2 +-
 .../models/blenderbot/modeling_blenderbot.py   |  6 +++---
 .../modeling_blenderbot_small.py               |  6 +++---
 .../models/convbert/modeling_convbert.py       |  2 +-
 .../models/electra/modeling_electra.py         |  2 +-
 src/transformers/models/gpt2/modeling_gpt2.py  |  2 +-
 .../models/gpt_neo/modeling_gpt_neo.py         |  4 ++--
 src/transformers/models/gptj/modeling_gptj.py  |  4 ++--
 .../models/imagegpt/modeling_imagegpt.py       |  2 +-
 src/transformers/models/led/modeling_led.py    |  6 +++---
 .../models/longt5/modeling_longt5.py           | 16 ++++++++--------
 .../models/m2m_100/modeling_m2m_100.py         |  6 +++---
 .../models/marian/modeling_marian.py           |  6 +++---
 .../models/mbart/modeling_mbart.py             |  6 +++---
 src/transformers/models/mt5/modeling_mt5.py    | 18 +++++++++---------
 src/transformers/models/opt/modeling_opt.py    |  4 ++--
 .../models/pegasus/modeling_pegasus.py         |  8 ++++----
 .../models/plbart/modeling_plbart.py           |  6 +++---
 .../models/roformer/modeling_roformer.py       |  4 ++--
 .../speech_to_text/modeling_speech_to_text.py  |  4 ++--
 src/transformers/models/t5/modeling_t5.py      | 16 ++++++++--------
 .../models/transfo_xl/modeling_transfo_xl.py   |  2 +-
 23 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index 595f719ba0125..477a82b07b7e6 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -497,7 +497,7 @@ class BartPretrainedModel(PreTrainedModel):
     config_class = BartConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"]
+    _keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"]

     def _init_weights(self, module):
         std = self.config.init_std
@@ -1272,7 +1272,7 @@ def forward(
 )
 class BartForConditionalGeneration(BartPretrainedModel):
     base_model_prefix = "model"
-    _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head.weight"]

     def __init__(self, config: BartConfig):
         super().__init__(config)
diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
index 0f7bc7f599f8b..b42eb803493e1 100755
--- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
+++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@@ -2476,7 +2476,7 @@ def forward(
 # Copied from transformers.models.bart.modeling_bart.BartForConditionalGeneration with Bart->BigBirdPegasus, BART->BIGBIRD_PEGASUS
 class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
     base_model_prefix = "model"
-    _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head.weight"]

     def __init__(self, config: BigBirdPegasusConfig):
         super().__init__(config)
diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py
index 574f4c8731e71..f5b34ed8baa7e 100755
--- a/src/transformers/models/blenderbot/modeling_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_blenderbot.py
@@ -1226,9 +1226,9 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
         r"final_logits_bias",
-        r"encoder\.version",
-        r"decoder\.version",
-        r"lm_head\.weight",
+        r"encoder.version",
+        r"decoder.version",
+        r"lm_head.weight",
     ]

     def __init__(self, config: BlenderbotConfig):
diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
index c65409be553bf..ac2051b322f37 100755
--- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
@@ -1208,9 +1208,9 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
         r"final_logits_bias",
-        r"encoder\.version",
-        r"decoder\.version",
-        r"lm_head\.weight",
+        r"encoder.version",
+        r"decoder.version",
+        r"lm_head.weight",
     ]

     def __init__(self, config: BlenderbotSmallConfig):
diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py
index 4fbe71fb55f23..9884d32aca7ec 100755
--- a/src/transformers/models/convbert/modeling_convbert.py
+++ b/src/transformers/models/convbert/modeling_convbert.py
@@ -251,7 +251,7 @@ class ConvBertPreTrainedModel(PreTrainedModel):
     base_model_prefix = "convbert"
     supports_gradient_checkpointing = True
     authorized_missing_keys = [r"position_ids"]
-    authorized_unexpected_keys = [r"convbert\.embeddings_project\.weight", r"convbert\.embeddings_project\.bias"]
+    authorized_unexpected_keys = [r"convbert.embeddings_project.weight", r"convbert.embeddings_project.bias"]

     def _init_weights(self, module):
         """Initialize the weights"""
diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py
index f397af906290a..3f488fbcf5648 100644
--- a/src/transformers/models/electra/modeling_electra.py
+++ b/src/transformers/models/electra/modeling_electra.py
@@ -670,7 +670,7 @@ class ElectraPreTrainedModel(PreTrainedModel):
     base_model_prefix = "electra"
     supports_gradient_checkpointing = True
     _keys_to_ignore_on_load_missing = [r"position_ids"]
-    _keys_to_ignore_on_load_unexpected = [r"electra\.embeddings_project\.weight", r"electra\.embeddings_project\.bias"]
+    _keys_to_ignore_on_load_unexpected = [r"electra.embeddings_project.weight", r"electra.embeddings_project.bias"]

     # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
     def _init_weights(self, module):
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index b71e5b3cbad7e..b5872be2815cb 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -1328,7 +1328,7 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) ->
     GPT2_START_DOCSTRING,
 )
 class GPT2ForSequenceClassification(GPT2PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]

     def __init__(self, config):
         super().__init__(config)
diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index 58f0993238a25..4e507d8d85980 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -661,7 +661,7 @@ def custom_forward(*inputs):
 class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
     _keys_to_ignore_on_load_missing = [
         r"h\.\d+\.attn\.masked_bias",
-        r"lm_head\.weight",
+        r"lm_head.weight",
         r"h\.\d+\.attn\.attention\.bias",
     ]
     _keys_to_ignore_on_save = [r"lm_head.weight"]
@@ -812,7 +812,7 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) ->
     GPT_NEO_START_DOCSTRING,
 )
 class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]

     def __init__(self, config):
         super().__init__(config)
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index 0ce12b329ca87..fed2ee12a8c95 100755
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -890,7 +890,7 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) ->
     GPTJ_START_DOCSTRING,
 )
 class GPTJForSequenceClassification(GPTJPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]

     def __init__(self, config):
         super().__init__(config)
@@ -1017,7 +1017,7 @@ def forward(
     GPTJ_START_DOCSTRING,
 )
 class GPTJForQuestionAnswering(GPTJPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]

     def __init__(self, config):
         super().__init__(config)
diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py
index c51dada0ed729..4a18c64a13dd2 100755
--- a/src/transformers/models/imagegpt/modeling_imagegpt.py
+++ b/src/transformers/models/imagegpt/modeling_imagegpt.py
@@ -1087,7 +1087,7 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) ->
     IMAGEGPT_START_DOCSTRING,
 )
 class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]

     def __init__(self, config: ImageGPTConfig):
         super().__init__(config)
diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index ca14de32b336e..6ac0bfccb1966 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -2298,9 +2298,9 @@ class LEDForConditionalGeneration(LEDPreTrainedModel):
     base_model_prefix = "led"
     _keys_to_ignore_on_load_missing = [
         r"final_logits_bias",
-        r"encoder\.version",
-        r"decoder\.version",
-        r"lm_head\.weight",
+        r"encoder.version",
+        r"decoder.version",
+        r"lm_head.weight",
     ]

     def __init__(self, config: LEDConfig):
diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py
index 9b7d2c62c4134..cd6c91a7e8c1e 100644
--- a/src/transformers/models/longt5/modeling_longt5.py
+++ b/src/transformers/models/longt5/modeling_longt5.py
@@ -1721,11 +1721,11 @@ def custom_forward(*inputs):
 )
 class LongT5Model(LongT5PreTrainedModel):
     _keys_to_ignore_on_load_missing = [
-        r"encoder\.embed_tokens\.weight",
-        r"decoder\.embed_tokens\.weight",
+        r"encoder.embed_tokens.weight",
+        r"decoder.embed_tokens.weight",
     ]
     _keys_to_ignore_on_load_unexpected = [
-        r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
+        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
     ]

     def __init__(self, config: LongT5Config):
@@ -1874,12 +1874,12 @@ def forward(
 @add_start_docstrings("""LONGT5 Model with a `language modeling` head on top.""", LONGT5_START_DOCSTRING)
 class LongT5ForConditionalGeneration(LongT5PreTrainedModel):
     _keys_to_ignore_on_load_missing = [
-        r"encoder\.embed_tokens\.weight",
-        r"decoder\.embed_tokens\.weight",
-        r"lm_head\.weight",
+        r"encoder.embed_tokens.weight",
+        r"decoder.embed_tokens.weight",
+        r"lm_head.weight",
     ]
     _keys_to_ignore_on_load_unexpected = [
-        r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
+        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
     ]

     def __init__(self, config: LongT5Config):
@@ -2116,7 +2116,7 @@ def _reorder_cache(self, past, beam_idx):
 )
 class LongT5EncoderModel(LongT5PreTrainedModel):
     authorized_missing_keys = [
-        r"encoder\.embed_tokens\.weight",
+        r"encoder.embed_tokens.weight",
     ]

     def __init__(self, config: LongT5Config):
diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py
index 5ced761677e36..90de52d4c351e 100755
--- a/src/transformers/models/m2m_100/modeling_m2m_100.py
+++ b/src/transformers/models/m2m_100/modeling_m2m_100.py
@@ -1241,9 +1241,9 @@ def forward(
 class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
-        r"encoder\.version",
-        r"decoder\.version",
-        r"lm_head\.weight",
+        r"encoder.version",
+        r"decoder.version",
+        r"lm_head.weight",
         r"model.encoder.embed_positions.weights",
         r"model.decoder.embed_positions.weights",
     ]
diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py
index 04a6c2d83f5fb..0dc30ed0b476c 100755
--- a/src/transformers/models/marian/modeling_marian.py
+++ b/src/transformers/models/marian/modeling_marian.py
@@ -1272,9 +1272,9 @@ class MarianMTModel(MarianPreTrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
         r"final_logits_bias",
-        r"encoder\.version",
-        r"decoder\.version",
-        r"lm_head\.weight",
+        r"encoder.version",
+        r"decoder.version",
+        r"lm_head.weight",
         r"embed_positions",
     ]

diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py
index d7f4958a8d311..d342d5fcbf3b4 100755
--- a/src/transformers/models/mbart/modeling_mbart.py
+++ b/src/transformers/models/mbart/modeling_mbart.py
@@ -1263,9 +1263,9 @@ class MBartForConditionalGeneration(MBartPreTrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
         r"final_logits_bias",
-        r"encoder\.version",
-        r"decoder\.version",
-        r"lm_head\.weight",
+        r"encoder.version",
+        r"decoder.version",
+        r"lm_head.weight",
     ]

     def __init__(self, config: MBartConfig):
diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py
index 314198c69a9a4..8c19a63eded3c 100644
--- a/src/transformers/models/mt5/modeling_mt5.py
+++ b/src/transformers/models/mt5/modeling_mt5.py
@@ -49,13 +49,13 @@ class MT5Model(T5Model):
     model_type = "mt5"
     config_class = MT5Config
     _keys_to_ignore_on_load_missing = [
-        r"encoder\.embed_tokens\.weight",
-        r"decoder\.embed_tokens\.weight",
-        r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
+        r"encoder.embed_tokens.weight",
+        r"decoder.embed_tokens.weight",
+        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
     ]
     _keys_to_ignore_on_save = [
-        r"encoder\.embed_tokens\.weight",
-        r"decoder\.embed_tokens\.weight",
+        r"encoder.embed_tokens.weight",
+        r"decoder.embed_tokens.weight",
     ]


@@ -84,10 +84,10 @@ class MT5ForConditionalGeneration(T5ForConditionalGeneration):
     model_type = "mt5"
     config_class = MT5Config
     _keys_to_ignore_on_load_missing = [
-        r"encoder\.embed_tokens\.weight",
+        r"encoder.embed_tokens.weight",
     ]
     _keys_to_ignore_on_save = [
-        r"encoder\.embed_tokens\.weight",
+        r"encoder.embed_tokens.weight",
     ]


@@ -112,8 +112,8 @@ class MT5EncoderModel(T5EncoderModel):
     model_type = "mt5"
     config_class = MT5Config
     _keys_to_ignore_on_load_missing = [
-        r"encoder\.embed_tokens\.weight",
+        r"encoder.embed_tokens.weight",
     ]
     _keys_to_ignore_on_save = [
-        r"encoder\.embed_tokens\.weight",
+        r"encoder.embed_tokens.weight",
     ]
diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py
index 8de0d1c3c23e1..6db58a82d61a3 100644
--- a/src/transformers/models/opt/modeling_opt.py
+++ b/src/transformers/models/opt/modeling_opt.py
@@ -382,7 +382,7 @@ class OPTPreTrainedModel(PreTrainedModel):
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["OPTDecoderLayer"]
-    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
+    _keys_to_ignore_on_load_unexpected = [r"decoder.version"]

     def _init_weights(self, module):
         std = self.config.init_std
@@ -780,7 +780,7 @@ def forward(


 class OPTForCausalLM(OPTPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]

     def __init__(self, config):
         super().__init__(config)
diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py
index 51620bbf36469..25a8676d6f1e4 100755
--- a/src/transformers/models/pegasus/modeling_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_pegasus.py
@@ -1290,10 +1290,10 @@ class PegasusForConditionalGeneration(PegasusPreTrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
         r"final_logits_bias",
-        r"encoder\.version",
-        r"decoder\.version",
-        r"lm_head\.weight",
-        r"embed_positions\.weight",
+        r"encoder.version",
+        r"decoder.version",
+        r"lm_head.weight",
+        r"embed_positions.weight",
     ]

     def __init__(self, config: PegasusConfig):
diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py
index 97bf620a9c63c..7ca17146f3c5f 100755
--- a/src/transformers/models/plbart/modeling_plbart.py
+++ b/src/transformers/models/plbart/modeling_plbart.py
@@ -1235,9 +1235,9 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
         r"final_logits_bias",
-        r"encoder\.version",
-        r"decoder\.version",
-        r"lm_head\.weight",
+        r"encoder.version",
+        r"decoder.version",
+        r"lm_head.weight",
     ]

     def __init__(self, config: PLBartConfig):
diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py
index 738df51119224..353b1b39217ae 100644
--- a/src/transformers/models/roformer/modeling_roformer.py
+++ b/src/transformers/models/roformer/modeling_roformer.py
@@ -699,8 +699,8 @@ class RoFormerPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _keys_to_ignore_on_load_missing = []
     _keys_to_ignore_on_load_unexpected = [
-        r"roformer\.embeddings_project\.weight",
-        r"roformer\.embeddings_project\.bias",
+        r"roformer.embeddings_project.weight",
+        r"roformer.embeddings_project.bias",
     ]

     def _init_weights(self, module):
diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
index 623a6b5910d71..78fac2cac1ecc 100755
--- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -1252,8 +1252,8 @@ def forward(
 class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
-        r"encoder\.version",
-        r"decoder\.version",
+        r"encoder.version",
+        r"decoder.version",
         r"model.encoder.embed_positions.weights",
         r"model.decoder.embed_positions.weights",
     ]
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index bdf3026dac66f..b974ad4b2003f 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -1266,11 +1266,11 @@ def custom_forward(*inputs):
 )
 class T5Model(T5PreTrainedModel):
     _keys_to_ignore_on_load_missing = [
-        r"encoder\.embed_tokens\.weight",
-        r"decoder\.embed_tokens\.weight",
+        r"encoder.embed_tokens.weight",
+        r"decoder.embed_tokens.weight",
     ]
     _keys_to_ignore_on_load_unexpected = [
-        r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
+        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
     ]

     def __init__(self, config: T5Config):
@@ -1455,12 +1455,12 @@ def forward(
 @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
 class T5ForConditionalGeneration(T5PreTrainedModel):
     _keys_to_ignore_on_load_missing = [
-        r"encoder\.embed_tokens\.weight",
-        r"decoder\.embed_tokens\.weight",
-        r"lm_head\.weight",
+        r"encoder.embed_tokens.weight",
+        r"decoder.embed_tokens.weight",
+        r"lm_head.weight",
     ]
     _keys_to_ignore_on_load_unexpected = [
-        r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
+        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
     ]

     def __init__(self, config: T5Config):
@@ -1749,7 +1749,7 @@ def _reorder_cache(self, past, beam_idx):
 )
 class T5EncoderModel(T5PreTrainedModel):
     authorized_missing_keys = [
-        r"encoder\.embed_tokens\.weight",
+        r"encoder.embed_tokens.weight",
     ]

     def __init__(self, config: T5Config):
diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py
index 1f8f40e455c62..7986aa7af9b61 100644
--- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py
@@ -1198,7 +1198,7 @@ def _reorder_cache(mems: List[torch.Tensor], beam_idx: torch.Tensor) -> List[tor
     TRANSFO_XL_START_DOCSTRING,
 )
 class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]

     def __init__(self, config):
         super().__init__(config)
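
Note appended below the diff, not part of the patch: the entries in _keys_to_ignore_on_load_missing
and _keys_to_ignore_on_load_unexpected are used as regular expression patterns, so dropping the
backslash escapes keeps them matching the same checkpoint keys, since an unescaped "." also matches
a literal dot. The snippet below is a minimal standalone sketch of that equivalence; it assumes, for
illustration only, that such patterns are checked against state_dict key names with re.search().

    import re

    # Illustrative checkpoint key names of the kind touched by this patch.
    keys = [
        "lm_head.weight",
        "encoder.version",
        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]

    # The same ignore patterns spelled with and without escaped dots.
    escaped = [
        r"lm_head\.weight",
        r"encoder\.version",
        r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
    ]
    unescaped = [
        r"lm_head.weight",
        r"encoder.version",
        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]

    for key in keys:
        hit_escaped = any(re.search(p, key) for p in escaped)
        hit_unescaped = any(re.search(p, key) for p in unescaped)
        # For these key names, both spellings agree on whether the key is ignored.
        assert hit_escaped == hit_unescaped
        print(f"{key!r}: ignored={hit_unescaped}")

Patterns that rely on regex syntax beyond a literal name, such as r"h\.\d+\.attn\.masked_bias",
keep their escapes unchanged in this patch.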