normalize keys_to_ignore #17722

Merged
merged 1 commit on Jun 15, 2022
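Context for the diff below: the `_keys_to_ignore_on_*` entries are treated as regular-expression patterns, so dropping the `\.` escapes does not change which checkpoint keys they match in practice; it only makes each pattern marginally looser, since an unescaped `.` matches any character, including a literal dot. A minimal sketch of that equivalence, using `lm_head.weight` purely as an illustrative key:

```python
import re

escaped = r"lm_head\.weight"    # pattern before this PR
normalized = r"lm_head.weight"  # pattern after this PR

key = "lm_head.weight"

# Both patterns match the real checkpoint key, because "." in a regex
# matches any character, including a literal dot.
assert re.search(escaped, key)
assert re.search(normalized, key)

# The only difference: the normalized pattern is slightly looser and would
# also match a hypothetical key like "lm_headXweight", which never occurs
# in these state dicts.
assert re.search(normalized, "lm_headXweight")
assert not re.search(escaped, "lm_headXweight")
```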
4 changes: 2 additions & 2 deletions src/transformers/models/bart/modeling_bart.py
@@ -497,7 +497,7 @@ class BartPretrainedModel(PreTrainedModel):
config_class = BartConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
- _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"]
+ _keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"]

def _init_weights(self, module):
std = self.config.init_std
@@ -1272,7 +1272,7 @@ def forward(
)
class BartForConditionalGeneration(BartPretrainedModel):
base_model_prefix = "model"
- _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"]
+ _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head.weight"]

def __init__(self, config: BartConfig):
super().__init__(config)
2 changes: 1 addition & 1 deletion src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@@ -2476,7 +2476,7 @@ def forward(
# Copied from transformers.models.bart.modeling_bart.BartForConditionalGeneration with Bart->BigBirdPegasus, BART->BIGBIRD_PEGASUS
class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
base_model_prefix = "model"
- _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"]
+ _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head.weight"]

def __init__(self, config: BigBirdPegasusConfig):
super().__init__(config)
6 changes: 3 additions & 3 deletions src/transformers/models/blenderbot/modeling_blenderbot.py
@@ -1226,9 +1226,9 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
- r"encoder\.version",
- r"decoder\.version",
- r"lm_head\.weight",
+ r"encoder.version",
+ r"decoder.version",
+ r"lm_head.weight",
]

def __init__(self, config: BlenderbotConfig):
6 changes: 3 additions & 3 deletions src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
@@ -1208,9 +1208,9 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
- r"encoder\.version",
- r"decoder\.version",
- r"lm_head\.weight",
+ r"encoder.version",
+ r"decoder.version",
+ r"lm_head.weight",
]

def __init__(self, config: BlenderbotSmallConfig):
2 changes: 1 addition & 1 deletion src/transformers/models/convbert/modeling_convbert.py
@@ -251,7 +251,7 @@ class ConvBertPreTrainedModel(PreTrainedModel):
base_model_prefix = "convbert"
supports_gradient_checkpointing = True
authorized_missing_keys = [r"position_ids"]
- authorized_unexpected_keys = [r"convbert\.embeddings_project\.weight", r"convbert\.embeddings_project\.bias"]
+ authorized_unexpected_keys = [r"convbert.embeddings_project.weight", r"convbert.embeddings_project.bias"]

def _init_weights(self, module):
"""Initialize the weights"""
2 changes: 1 addition & 1 deletion src/transformers/models/electra/modeling_electra.py
@@ -670,7 +670,7 @@ class ElectraPreTrainedModel(PreTrainedModel):
base_model_prefix = "electra"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
- _keys_to_ignore_on_load_unexpected = [r"electra\.embeddings_project\.weight", r"electra\.embeddings_project\.bias"]
+ _keys_to_ignore_on_load_unexpected = [r"electra.embeddings_project.weight", r"electra.embeddings_project.bias"]

# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
def _init_weights(self, module):
2 changes: 1 addition & 1 deletion src/transformers/models/gpt2/modeling_gpt2.py
@@ -1328,7 +1328,7 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) ->
GPT2_START_DOCSTRING,
)
class GPT2ForSequenceClassification(GPT2PreTrainedModel):
- _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]

def __init__(self, config):
super().__init__(config)
4 changes: 2 additions & 2 deletions src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -661,7 +661,7 @@ def custom_forward(*inputs):
class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"h\.\d+\.attn\.masked_bias",
- r"lm_head\.weight",
+ r"lm_head.weight",
r"h\.\d+\.attn\.attention\.bias",
]
_keys_to_ignore_on_save = [r"lm_head.weight"]
@@ -812,7 +812,7 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) ->
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
- _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]

def __init__(self, config):
super().__init__(config)
4 changes: 2 additions & 2 deletions src/transformers/models/gptj/modeling_gptj.py
@@ -890,7 +890,7 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) ->
GPTJ_START_DOCSTRING,
)
class GPTJForSequenceClassification(GPTJPreTrainedModel):
- _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head\.weight"]
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]

def __init__(self, config):
super().__init__(config)
@@ -1017,7 +1017,7 @@ def forward(
GPTJ_START_DOCSTRING,
)
class GPTJForQuestionAnswering(GPTJPreTrainedModel):
- _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head\.weight"]
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]

def __init__(self, config):
super().__init__(config)
2 changes: 1 addition & 1 deletion src/transformers/models/imagegpt/modeling_imagegpt.py
@@ -1087,7 +1087,7 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) ->
IMAGEGPT_START_DOCSTRING,
)
class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
- _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]

def __init__(self, config: ImageGPTConfig):
super().__init__(config)
6 changes: 3 additions & 3 deletions src/transformers/models/led/modeling_led.py
@@ -2298,9 +2298,9 @@ class LEDForConditionalGeneration(LEDPreTrainedModel):
base_model_prefix = "led"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
- r"encoder\.version",
- r"decoder\.version",
- r"lm_head\.weight",
+ r"encoder.version",
+ r"decoder.version",
+ r"lm_head.weight",
]

def __init__(self, config: LEDConfig):
16 changes: 8 additions & 8 deletions src/transformers/models/longt5/modeling_longt5.py
@@ -1721,11 +1721,11 @@ def custom_forward(*inputs):
)
class LongT5Model(LongT5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
- r"encoder\.embed_tokens\.weight",
- r"decoder\.embed_tokens\.weight",
+ r"encoder.embed_tokens.weight",
+ r"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_unexpected = [
- r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
+ r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]

def __init__(self, config: LongT5Config):
@@ -1874,12 +1874,12 @@ def forward(
@add_start_docstrings("""LONGT5 Model with a `language modeling` head on top.""", LONGT5_START_DOCSTRING)
class LongT5ForConditionalGeneration(LongT5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
- r"encoder\.embed_tokens\.weight",
- r"decoder\.embed_tokens\.weight",
- r"lm_head\.weight",
+ r"encoder.embed_tokens.weight",
+ r"decoder.embed_tokens.weight",
+ r"lm_head.weight",
]
_keys_to_ignore_on_load_unexpected = [
- r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
+ r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]

def __init__(self, config: LongT5Config):
@@ -2116,7 +2116,7 @@ def _reorder_cache(self, past, beam_idx):
)
class LongT5EncoderModel(LongT5PreTrainedModel):
authorized_missing_keys = [
- r"encoder\.embed_tokens\.weight",
+ r"encoder.embed_tokens.weight",
]

def __init__(self, config: LongT5Config):
6 changes: 3 additions & 3 deletions src/transformers/models/m2m_100/modeling_m2m_100.py
@@ -1241,9 +1241,9 @@ def forward(
class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
- r"encoder\.version",
- r"decoder\.version",
- r"lm_head\.weight",
+ r"encoder.version",
+ r"decoder.version",
+ r"lm_head.weight",
r"model.encoder.embed_positions.weights",
r"model.decoder.embed_positions.weights",
]
6 changes: 3 additions & 3 deletions src/transformers/models/marian/modeling_marian.py
@@ -1272,9 +1272,9 @@ class MarianMTModel(MarianPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
- r"encoder\.version",
- r"decoder\.version",
- r"lm_head\.weight",
+ r"encoder.version",
+ r"decoder.version",
+ r"lm_head.weight",
r"embed_positions",
]

6 changes: 3 additions & 3 deletions src/transformers/models/mbart/modeling_mbart.py
@@ -1263,9 +1263,9 @@ class MBartForConditionalGeneration(MBartPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
- r"encoder\.version",
- r"decoder\.version",
- r"lm_head\.weight",
+ r"encoder.version",
+ r"decoder.version",
+ r"lm_head.weight",
]

def __init__(self, config: MBartConfig):
18 changes: 9 additions & 9 deletions src/transformers/models/mt5/modeling_mt5.py
@@ -49,13 +49,13 @@ class MT5Model(T5Model):
model_type = "mt5"
config_class = MT5Config
_keys_to_ignore_on_load_missing = [
- r"encoder\.embed_tokens\.weight",
- r"decoder\.embed_tokens\.weight",
- r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
+ r"encoder.embed_tokens.weight",
+ r"decoder.embed_tokens.weight",
+ r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_keys_to_ignore_on_save = [
- r"encoder\.embed_tokens\.weight",
- r"decoder\.embed_tokens\.weight",
+ r"encoder.embed_tokens.weight",
+ r"decoder.embed_tokens.weight",
]


@@ -84,10 +84,10 @@ class MT5ForConditionalGeneration(T5ForConditionalGeneration):
model_type = "mt5"
config_class = MT5Config
_keys_to_ignore_on_load_missing = [
- r"encoder\.embed_tokens\.weight",
+ r"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_save = [
- r"encoder\.embed_tokens\.weight",
+ r"encoder.embed_tokens.weight",
]


@@ -112,8 +112,8 @@ class MT5EncoderModel(T5EncoderModel):
model_type = "mt5"
config_class = MT5Config
_keys_to_ignore_on_load_missing = [
- r"encoder\.embed_tokens\.weight",
+ r"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_save = [
- r"encoder\.embed_tokens\.weight",
+ r"encoder.embed_tokens.weight",
]
4 changes: 2 additions & 2 deletions src/transformers/models/opt/modeling_opt.py
@@ -382,7 +382,7 @@ class OPTPreTrainedModel(PreTrainedModel):
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["OPTDecoderLayer"]
- _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
+ _keys_to_ignore_on_load_unexpected = [r"decoder.version"]

def _init_weights(self, module):
std = self.config.init_std
@@ -780,7 +780,7 @@ def forward(


class OPTForCausalLM(OPTPreTrainedModel):
- _keys_to_ignore_on_load_missing = [r"lm_head\.weight"]
+ _keys_to_ignore_on_load_missing = [r"lm_head.weight"]

def __init__(self, config):
super().__init__(config)
8 changes: 4 additions & 4 deletions src/transformers/models/pegasus/modeling_pegasus.py
@@ -1290,10 +1290,10 @@ class PegasusForConditionalGeneration(PegasusPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
- r"encoder\.version",
- r"decoder\.version",
- r"lm_head\.weight",
- r"embed_positions\.weight",
+ r"encoder.version",
+ r"decoder.version",
+ r"lm_head.weight",
+ r"embed_positions.weight",
]

def __init__(self, config: PegasusConfig):
6 changes: 3 additions & 3 deletions src/transformers/models/plbart/modeling_plbart.py
@@ -1235,9 +1235,9 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
- r"encoder\.version",
- r"decoder\.version",
- r"lm_head\.weight",
+ r"encoder.version",
+ r"decoder.version",
+ r"lm_head.weight",
]

def __init__(self, config: PLBartConfig):
4 changes: 2 additions & 2 deletions src/transformers/models/roformer/modeling_roformer.py
@@ -699,8 +699,8 @@ class RoFormerPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = []
_keys_to_ignore_on_load_unexpected = [
- r"roformer\.embeddings_project\.weight",
- r"roformer\.embeddings_project\.bias",
+ r"roformer.embeddings_project.weight",
+ r"roformer.embeddings_project.bias",
]

def _init_weights(self, module):
4 changes: 2 additions & 2 deletions src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -1252,8 +1252,8 @@ def forward(
class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
- r"encoder\.version",
- r"decoder\.version",
+ r"encoder.version",
+ r"decoder.version",
r"model.encoder.embed_positions.weights",
r"model.decoder.embed_positions.weights",
]
16 changes: 8 additions & 8 deletions src/transformers/models/t5/modeling_t5.py
@@ -1266,11 +1266,11 @@ def custom_forward(*inputs):
)
class T5Model(T5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
- r"encoder\.embed_tokens\.weight",
- r"decoder\.embed_tokens\.weight",
+ r"encoder.embed_tokens.weight",
+ r"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_unexpected = [
- r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
+ r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]

def __init__(self, config: T5Config):
@@ -1455,12 +1455,12 @@ def forward(
@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
class T5ForConditionalGeneration(T5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
- r"encoder\.embed_tokens\.weight",
- r"decoder\.embed_tokens\.weight",
- r"lm_head\.weight",
+ r"encoder.embed_tokens.weight",
+ r"decoder.embed_tokens.weight",
+ r"lm_head.weight",
]
_keys_to_ignore_on_load_unexpected = [
- r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
+ r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]

def __init__(self, config: T5Config):
@@ -1749,7 +1749,7 @@ def _reorder_cache(self, past, beam_idx):
)
class T5EncoderModel(T5PreTrainedModel):
authorized_missing_keys = [
- r"encoder\.embed_tokens\.weight",
+ r"encoder.embed_tokens.weight",
]

def __init__(self, config: T5Config):
2 changes: 1 addition & 1 deletion src/transformers/models/transfo_xl/modeling_transfo_xl.py
@@ -1198,7 +1198,7 @@ def _reorder_cache(mems: List[torch.Tensor], beam_idx: torch.Tensor) -> List[tor
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel):
- _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]

def __init__(self, config):
super().__init__(config)
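For context on how these attributes are consumed: when weights are loaded, missing and unexpected state-dict keys are filtered against the class's patterns before any warning is emitted. A simplified sketch of that filtering follows; `filter_ignored_keys` is a name chosen here for illustration, not a transformers API, and the real logic in `from_pretrained` is more involved.

```python
import re
from typing import List

def filter_ignored_keys(keys: List[str], patterns: List[str]) -> List[str]:
    # Keep only the keys that match none of the ignore patterns; this mirrors,
    # in simplified form, how missing/unexpected keys are pruned before the
    # library warns about them.
    if not patterns:
        return list(keys)
    return [k for k in keys if not any(re.search(p, k) for p in patterns)]

# With the normalized (unescaped) patterns from this PR, the ignorable keys
# are pruned and only genuinely unexpected ones would be reported.
missing = ["final_logits_bias", "lm_head.weight", "model.shared.weight"]
patterns = [r"final_logits_bias", r"lm_head.weight"]
print(filter_ignored_keys(missing, patterns))  # ['model.shared.weight']
```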