From 2ebd575816d6416d6484e4477184f4e3019aa006 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 30 Jun 2022 17:37:33 +0200 Subject: [PATCH 001/164] initial files --- .../time_series_transformer/__init__.py | 17 ++++++++++++ .../configuration_time_series_transformer.py | 24 +++++++++++++++++ .../modeling_time_series_transformer.py | 27 +++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 src/transformers/models/time_series_transformer/__init__.py create mode 100644 src/transformers/models/time_series_transformer/configuration_time_series_transformer.py create mode 100644 src/transformers/models/time_series_transformer/modeling_time_series_transformer.py diff --git a/src/transformers/models/time_series_transformer/__init__.py b/src/transformers/models/time_series_transformer/__init__.py new file mode 100644 index 0000000000000..ee533a40bfb68 --- /dev/null +++ b/src/transformers/models/time_series_transformer/__init__.py @@ -0,0 +1,17 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py new file mode 100644 index 0000000000000..3d76375afcbd8 --- /dev/null +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -0,0 +1,24 @@ +# coding=utf-8 +# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Time Series Transformer model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +class TimeSeriesTransformerConfig(PretrainedConfig): + pass \ No newline at end of file diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py new file mode 100644 index 0000000000000..9225108b443bf --- /dev/null +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -0,0 +1,27 @@ +# coding=utf-8 +# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. 
team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Time Series Transformer model.""" + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import functional as F + +from ...modeling_utils import PreTrainedModel + + +class TimeSeriesTransformerPreTrainedModel(PreTrainedModel): + pass \ No newline at end of file From cc0b364ca7657328bc767bad1147a7e8b463a60e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 30 Jun 2022 18:50:18 +0200 Subject: [PATCH 002/164] initial model via cli --- .../en/model_doc/time_series_transformer.mdx | 77 + src/transformers/__init__.py | 33 +- src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/auto/modeling_auto.py | 8 + .../time_series_transformer/__init__.py | 77 +- .../configuration_time_series_transformer.py | 147 +- .../modeling_time_series_transformer.py | 1781 ++++++++++++++++- .../time_series_transformer/__init__.py | 0 .../test_modeling_time_series_transformer.py | 603 ++++++ utils/check_repo.py | 6 + 11 files changed, 2724 insertions(+), 12 deletions(-) create mode 100644 docs/source/en/model_doc/time_series_transformer.mdx mode change 100644 => 100755 src/transformers/models/time_series_transformer/modeling_time_series_transformer.py create mode 100644 tests/models/time_series_transformer/__init__.py create mode 100644 tests/models/time_series_transformer/test_modeling_time_series_transformer.py diff --git a/docs/source/en/model_doc/time_series_transformer.mdx b/docs/source/en/model_doc/time_series_transformer.mdx new file mode 100644 index 0000000000000..4c768a8058bf2 --- /dev/null +++ b/docs/source/en/model_doc/time_series_transformer.mdx @@ -0,0 +1,77 @@ + + +# TimeSeriesTransformer + +## Overview + +The TimeSeriesTransformer model was proposed in []() by . + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](). The original code can be found [here](). 
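+
+A minimal usage sketch (mirroring the example in the configuration docstring added in this PR; `huggingface/tst-ett` is the placeholder checkpoint name used by the auto-generated template, and these class names may still change as the port progresses):
+
+```python
+>>> from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerModel
+
+>>> # Initializing a TimeSeriesTransformer huggingface/tst-ett style configuration
+>>> configuration = TimeSeriesTransformerConfig()
+
+>>> # Initializing a model (with random weights) from that configuration
+>>> model = TimeSeriesTransformerModel(configuration)
+
+>>> # Accessing the model configuration
+>>> configuration = model.config
+```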
+ +## TimeSeriesTransformerConfig + +[[autodoc]] TimeSeriesTransformerConfig + + +## TimeSeriesTransformerTokenizer + +[[autodoc]] TimeSeriesTransformerTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + + +## TimeSeriesTransformerTokenizerFast + +[[autodoc]] TimeSeriesTransformerTokenizerFast + + +## TimeSeriesTransformerModel + +[[autodoc]] TimeSeriesTransformerModel + - forward + + +## TimeSeriesTransformerForConditionalGeneration + +[[autodoc]] TimeSeriesTransformerForConditionalGeneration + - forward + + +## TimeSeriesTransformerForSequenceClassification + +[[autodoc]] TimeSeriesTransformerForSequenceClassification + - forward + + +## TimeSeriesTransformerForQuestionAnswering + +[[autodoc]] TimeSeriesTransformerForQuestionAnswering + - forward + + +## TimeSeriesTransformerForCausalLM + +[[autodoc]] TimeSeriesTransformerForCausalLM + - forward + + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 75837649c9eea..4840b8f120b53 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -308,6 +308,7 @@ "models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"], "models.tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig", "TapasTokenizer"], "models.tapex": ["TapexTokenizer"], + "models.time_series_transformer": ["TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimeSeriesTransformerConfig", "TimeSeriesTransformerTokenizer"], "models.trajectory_transformer": [ "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TrajectoryTransformerConfig", @@ -505,6 +506,7 @@ ] else: # Fast tokenizers structure + _import_structure["models.time_series_transformer"].append("TimeSeriesTransformerTokenizerFast") _import_structure["models.albert"].append("AlbertTokenizerFast") _import_structure["models.bart"].append("BartTokenizerFast") _import_structure["models.barthez"].append("BarthezTokenizerFast") @@ -750,6 +752,18 @@ _import_structure["modeling_utils"] = ["PreTrainedModel"] # PyTorch models structure + + _import_structure["models.time_series_transformer"].extend( + [ + "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "TimeSeriesTransformerForCausalLM", + "TimeSeriesTransformerForConditionalGeneration", + "TimeSeriesTransformerForQuestionAnswering", + "TimeSeriesTransformerForSequenceClassification", + "TimeSeriesTransformerModel", + "TimeSeriesTransformerPreTrainedModel", + ] + ) _import_structure["models.albert"].extend( [ "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3013,6 +3027,11 @@ from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer from .models.tapex import TapexTokenizer + from .models.time_series_transformer import ( + TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + TimeSeriesTransformerConfig, + TimeSeriesTransformerTokenizer, + ) from .models.trajectory_transformer import ( TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TrajectoryTransformerConfig, @@ -3235,6 +3254,7 @@ from .models.splinter import SplinterTokenizerFast from .models.squeezebert import SqueezeBertTokenizerFast from .models.t5 import T5TokenizerFast + from .models.time_series_transformer import TimeSeriesTransformerTokenizerFast from .models.xglm import XGLMTokenizerFast from .models.xlm_roberta import XLMRobertaTokenizerFast from .models.xlnet import XLNetTokenizerFast @@ -3385,8 +3405,6 @@ ) from .generation_utils import top_k_top_p_filtering 
from .modeling_utils import PreTrainedModel - - # PyTorch model imports from .models.albert import ( ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertForMaskedLM, @@ -4198,6 +4216,17 @@ T5PreTrainedModel, load_tf_weights_in_t5, ) + + # PyTorch model imports + from .models.time_series_transformer import ( + TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + TimeSeriesTransformerForCausalLM, + TimeSeriesTransformerForConditionalGeneration, + TimeSeriesTransformerForQuestionAnswering, + TimeSeriesTransformerForSequenceClassification, + TimeSeriesTransformerModel, + TimeSeriesTransformerPreTrainedModel, + ) from .models.trajectory_transformer import ( TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, TrajectoryTransformerModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index c4b48e6cec658..ee4752a9166b1 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -128,6 +128,7 @@ t5, tapas, tapex, + time_series_transformer, trajectory_transformer, transfo_xl, trocr, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 4e32b510b0c1e..f6aaccd0f4ac1 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -29,6 +29,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here + ("time_series_transformer", "TimeSeriesTransformerConfig"), ("albert", "AlbertConfig"), ("bart", "BartConfig"), ("beit", "BeitConfig"), @@ -153,6 +154,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( [ # Add archive maps here) + ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -262,6 +264,7 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here + ("time_series_transformer", "TimeSeriesTransformer"), ("albert", "ALBERT"), ("bart", "BART"), ("barthez", "BARThez"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 51c63aaf5dd4a..0df2ee1f30584 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -28,6 +28,7 @@ MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping + ("time_series_transformer", "TimeSeriesTransformerModel"), ("albert", "AlbertModel"), ("bart", "BartModel"), ("beit", "BeitModel"), @@ -197,6 +198,8 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( [ # Model with LM heads mapping + + ("time_series_transformer", "TimeSeriesTransformerForConditionalGeneration"), ("albert", "AlbertForMaskedLM"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForMaskedLM"), @@ -259,6 +262,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( [ # Model for Causal LM mapping + ("time_series_transformer", "TimeSeriesTransformerForCausalLM"), ("bart", "BartForCausalLM"), ("bert", "BertLMHeadModel"), ("bert-generation", "BertGenerationDecoder"), @@ -434,6 +438,8 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( [ # Model for Seq2Seq Causal LM mapping + + ("time_series_transformer", "TimeSeriesTransformerForConditionalGeneration"), ("bart", "BartForConditionalGeneration"), ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"), ("blenderbot", "BlenderbotForConditionalGeneration"), @@ -465,6 +471,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = 
OrderedDict( [ # Model for Sequence Classification mapping + ("time_series_transformer", "TimeSeriesTransformerForSequenceClassification"), ("albert", "AlbertForSequenceClassification"), ("bart", "BartForSequenceClassification"), ("bert", "BertForSequenceClassification"), @@ -521,6 +528,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( [ # Model for Question Answering mapping + ("time_series_transformer", "TimeSeriesTransformerForQuestionAnswering"), ("albert", "AlbertForQuestionAnswering"), ("bart", "BartForQuestionAnswering"), ("bert", "BertForQuestionAnswering"), diff --git a/src/transformers/models/time_series_transformer/__init__.py b/src/transformers/models/time_series_transformer/__init__.py index ee533a40bfb68..d7ffb0e4ff25a 100644 --- a/src/transformers/models/time_series_transformer/__init__.py +++ b/src/transformers/models/time_series_transformer/__init__.py @@ -2,7 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -# Copyright 2022 The HuggingFace Team. All rights reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,3 +15,78 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...utils import _LazyModule, OptionalDependencyNotAvailable, is_tokenizers_available +from ...utils import is_torch_available + + + + +_import_structure = { + "configuration_time_series_transformer": ["TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimeSeriesTransformerConfig"], + "tokenization_time_series_transformer": ["TimeSeriesTransformerTokenizer"], +} + +try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_time_series_transformer_fast"] = ["TimeSeriesTransformerTokenizerFast"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_time_series_transformer"] = [ + "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "TimeSeriesTransformerForConditionalGeneration", + "TimeSeriesTransformerForQuestionAnswering", + "TimeSeriesTransformerForSequenceClassification", + "TimeSeriesTransformerForCausalLM", + "TimeSeriesTransformerModel", + "TimeSeriesTransformerPreTrainedModel", + ] + + + + +if TYPE_CHECKING: + from .configuration_time_series_transformer import TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimeSeriesTransformerConfig + from .tokenization_time_series_transformer import TimeSeriesTransformerTokenizer + + try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_time_series_transformer_fast import TimeSeriesTransformerTokenizerFast + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_time_series_transformer import ( + TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + TimeSeriesTransformerForConditionalGeneration, + TimeSeriesTransformerForCausalLM, + 
TimeSeriesTransformerForQuestionAnswering, + TimeSeriesTransformerForSequenceClassification, + TimeSeriesTransformerModel, + TimeSeriesTransformerPreTrainedModel, + ) + + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 3d76375afcbd8..ee3f7705b4f90 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 kashif and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Time Series Transformer model configuration""" +""" TimeSeriesTransformer model configuration """ from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -20,5 +20,146 @@ logger = logging.get_logger(__name__) +TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "huggingface/tst-ett": "https://huggingface.co/huggingface/tst-ett/resolve/main/config.json", + # See all TimeSeriesTransformer models at https://huggingface.co/models?filter=time_series_transformer +} + + class TimeSeriesTransformerConfig(PretrainedConfig): - pass \ No newline at end of file + r""" + This is the configuration class to store the configuration of a [`~TimeSeriesTransformerModel`]. + It is used to instantiate an TimeSeriesTransformer model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the TimeSeriesTransformer [huggingface/tst-ett](https://huggingface.co/huggingface/tst-ett) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used + to control the model outputs. Read the documentation from [`PretrainedConfig`] + for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the TimeSeriesTransformer model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`~TimeSeriesTransformerModel`] or + [`~TFTimeSeriesTransformerModel`]. + d_model (`int`, *optional*, defaults to 1024): + Dimension of the layers and the pooler layer. + encoder_layers (`int`, *optional*, defaults to 12): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 12): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. 
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop: (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see + https://arxiv.org/abs/1909.11556) for more details. + decoder_layerdrop: (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see + https://arxiv.org/abs/1909.11556) for more details. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + Example: + + ```python + >>> from transformers import TimeSeriesTransformerModel, TimeSeriesTransformerConfig + + >>> # Initializing a TimeSeriesTransformer huggingface/tst-ett style configuration + >>> configuration = TimeSeriesTransformerConfig() + + >>> # Initializing a model from the huggingface/tst-ett style configuration + >>> model = TimeSeriesTransformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` +""" + model_type = "time_series_transformer" + keys_to_ignore_at_inference = ["past_key_values"] + + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model" + } + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=2, + classifier_dropout=0.0, + scale_embedding=False, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + 
self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs + ) + + \ No newline at end of file diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py old mode 100644 new mode 100755 index 9225108b443bf..caa0644185b9c --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 kashif The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,16 +12,1785 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Time Series Transformer model.""" +""" PyTorch TimeSeriesTransformer model. """ + + +import math +import copy +import random +from typing import Optional, Tuple, List, Union -import numpy as np import torch -import torch.utils.checkpoint from torch import nn -from torch.nn import functional as F +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from ...activations import ACT2FN +from ...utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + CausalLMOutputWithCrossAttentions +) from ...modeling_utils import PreTrainedModel +from ...utils import logging +from .configuration_time_series_transformer import TimeSeriesTransformerConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "huggingface/tst-ett" +_CONFIG_FOR_DOC = "TimeSeriesTransformerConfig" +_TOKENIZER_FOR_DOC = "TimeSeriesTransformerTokenizer" + + +TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "huggingface/tst-ett", + # See all TimeSeriesTransformer models at https://huggingface.co/models?filter=time_series_transformer +] + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." 
+ # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min)) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask( + mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None +): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +class TimeSeriesTransformerLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + super().__init__(num_embeddings, embedding_dim) + + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions) + + +class TimeSeriesTransformerAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." 
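+        # note: `self.scaling` below is 1/sqrt(head_dim); query states are pre-multiplied by it
+        # before the query/key dot product, i.e. standard scaled dot-product attention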
+ self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class TimeSeriesTransformerEncoderLayer(nn.Module): + def __init__(self, config: TimeSeriesTransformerConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = TimeSeriesTransformerAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + attention_mask (`torch.FloatTensor`): attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + *(config.encoder_attention_heads,)*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if hidden_states.dtype == torch.float16 and (torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class TimeSeriesTransformerDecoderLayer(nn.Module): + def __init__(self, config: TimeSeriesTransformerConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = TimeSeriesTransformerAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = TimeSeriesTransformerAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + attention_mask (`torch.FloatTensor`): attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + *(encoder_attention_heads,)*. + cross_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size *(decoder_attention_heads,)*. 
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class TimeSeriesTransformerClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states class TimeSeriesTransformerPreTrainedModel(PreTrainedModel): - pass \ No newline at end of file + config_class = TimeSeriesTransformerConfig + base_model_prefix = "model" + 
supports_gradient_checkpointing = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (TimeSeriesTransformerDecoder, TimeSeriesTransformerEncoder)): + module.gradient_checkpointing = value + + +TIME_SERIES_TRANSFORMER_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`~TimeSeriesTransformerConfig`]): + Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. +""" + +TIME_SERIES_TRANSFORMER_GENERATION_EXAMPLE = r""" + Summarization example: + + ```python + >>> from transformers import TimeSeriesTransformerTokenizer, TimeSeriesTransformerForConditionalGeneration + + >>> model = TimeSeriesTransformerForConditionalGeneration.from_pretrained('huggingface/tst-ett') + >>> tokenizer = TimeSeriesTransformerTokenizer.from_pretrained('huggingface/tst-ett') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5) + >>> print(tokenizer.decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + ``` +""" + +TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for + details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the `input_ids` to the right, following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will + also be used by default. 
+ + If you want to change padding behavior, you should read [`modeling_time_series_transformer._prepare_decoder_attention_mask`] and + modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: + `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, + *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors + of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` + (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` + instead of all ``decoder_input_ids``` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` + have to be input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` + takes the value of `inputs_embeds`. 
+ use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up + decoding (see `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +TIME_SERIES_TRANSFORMER_STANDALONE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`ProphetNetTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for + details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`TimeSeriesTransformerEncoderLayer`]. + + Args: + config: TimeSeriesTransformerConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: TimeSeriesTransformerConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + self.embed_positions = TimeSeriesTransformerLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([TimeSeriesTransformerEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(embed_dim) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert `input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." 
+ + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TimeSeriesTransformerDecoderLayer`] + + Args: + config: TimeSeriesTransformerConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: TimeSeriesTransformerConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = TimeSeriesTransformerLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList([TimeSeriesTransformerDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return 
combined_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 + tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional + tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential + decoding. + + If `past_key_values` are used, the user can optionally input only the last + `decoder_input_ids` (those that don't have their past key value states given to this model) of + shape `(batch_size, 1)` instead of all ``decoder_input_ids``` of shape `(batch_size, + sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + attention_mask = self._prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + assert attn_mask.size()[0] == ( + len(self.layers) + ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning("`use_cache = True` is incompatible with gradient checkpointing`. 
Setting `use_cache = False`...") + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare TimeSeriesTransformer Model outputting raw hidden-states without any specific head on top.", + TIME_SERIES_TRANSFORMER_START_DOCSTRING, +) +class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel): + def __init__(self, config: TimeSeriesTransformerConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = TimeSeriesTransformerEncoder(config, self.shared) + self.decoder = TimeSeriesTransformerDecoder(config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions 
is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The TimeSeriesTransformer Model with a language modeling head. 
Can be used for summarization.", TIME_SERIES_TRANSFORMER_START_DOCSTRING +) +class TimeSeriesTransformerForConditionalGeneration(TimeSeriesTransformerPreTrainedModel): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + r"lm_head\.weight", + ] + + def __init__(self, config: TimeSeriesTransformerConfig): + super().__init__(config) + self.model = TimeSeriesTransformerModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self._resize_final_logits_bias(new_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(TIME_SERIES_TRANSFORMER_GENERATION_EXAMPLE) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Conditional generation example: + + ```python + >>> from transformers import TimeSeriesTransformerTokenizer, TimeSeriesTransformerForConditionalGeneration + >>> tokenizer = TimeSeriesTransformerTokenizer.from_pretrained('huggingface/tst-ett') + >>> TXT = "My friends are but they eat too many carbs." 
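+        >>> # the example sentence is expected to contain the tokenizer's mask token, e.g.
+        >>> # TXT = "My friends are <mask> but they eat too many carbs."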
+ + >>> model = TimeSeriesTransformerForConditionalGeneration.from_pretrained('huggingface/tst-ett') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + ``` +""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings( + """ + TimeSeriesTransformer model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. 
+ """, + TIME_SERIES_TRANSFORMER_START_DOCSTRING, +) +class TimeSeriesTransformerForSequenceClassification(TimeSeriesTransformerPreTrainedModel): + def __init__(self, config: TimeSeriesTransformerConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = TimeSeriesTransformerModel(config) + self.classification_head = TimeSeriesTransformerClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + self.model._init_weights(self.classification_head.dense) + self.model._init_weights(self.classification_head.out_proj) + + @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id) + + if len(torch.unique_consecutive(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + 
output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + TimeSeriesTransformer Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + TIME_SERIES_TRANSFORMER_START_DOCSTRING, +) +class TimeSeriesTransformerForQuestionAnswering(TimeSeriesTransformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = TimeSeriesTransformerModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.model._init_weights(self.qa_outputs) + + @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + start_positions=None, + end_positions=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + +class TimeSeriesTransformerDecoderWrapper(TimeSeriesTransformerPreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. 
+ """ + + def __init__(self, config): + super().__init__(config) + self.decoder = TimeSeriesTransformerDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +class TimeSeriesTransformerForCausalLM(TimeSeriesTransformerPreTrainedModel): + def __init__(self, config): + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = TimeSeriesTransformerDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` + (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` + instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up + decoding (see `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + + Example: + + ```python + >>> from transformers import TimeSeriesTransformerTokenizer, TimeSeriesTransformerForCausalLM + + >>> tokenizer = TimeSeriesTransformerTokenizer.from_pretrained('facebook/bart-large') + >>> model = TimeSeriesTransformerForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." 
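+        >>> # with `add_cross_attention=False` the model acts as a decoder-only (causal) LM, so no
+        >>> # encoder hidden states are needed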
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + ``` +""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + if past: + input_ids = input_ids[:, -1:] + # first step, decoder_cached_states are empty + return { + "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed + "attention_mask": attention_mask, + "past_key_values": past, + "use_cache": use_cache, + } + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past diff --git a/tests/models/time_series_transformer/__init__.py b/tests/models/time_series_transformer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py new file mode 100644 index 0000000000000..6172a1f2ae44a --- /dev/null +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -0,0 +1,603 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Testing suite for the PyTorch TimeSeriesTransformer model. """ + + +import copy +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...generation.test_generation_utils import GenerationTesterMixin +from ...test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + TimeSeriesTransformerConfig, + TimeSeriesTransformerForConditionalGeneration, + TimeSeriesTransformerForQuestionAnswering, + TimeSeriesTransformerForCausalLM, + TimeSeriesTransformerForSequenceClassification, + TimeSeriesTransformerModel, + TimeSeriesTransformerTokenizer, + ) + from transformers.models.time_series_transformer.modeling_time_series_transformer import ( + TimeSeriesTransformerDecoder, + TimeSeriesTransformerEncoder, + ) + + +def prepare_time_series_transformer_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, + } + + +@require_torch +class TimeSeriesTransformerModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = TimeSeriesTransformerConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + 
eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_time_series_transformer_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = TimeSeriesTransformerModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = TimeSeriesTransformerModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = TimeSeriesTransformerEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = TimeSeriesTransformerDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class TimeSeriesTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + (TimeSeriesTransformerModel, TimeSeriesTransformerForConditionalGeneration, TimeSeriesTransformerForSequenceClassification, TimeSeriesTransformerForQuestionAnswering) + if is_torch_available() + else () + ) + 
all_generative_model_classes = (TimeSeriesTransformerForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_missing_keys = False + + def setUp(self): + self.model_tester = TimeSeriesTransformerModelTester(self) + self.config_tester = ConfigTester(self, config_class=TimeSeriesTransformerConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + # TimeSeriesTransformerForSequenceClassification does not support inputs_embeds + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in (TimeSeriesTransformerModel, TimeSeriesTransformerForConditionalGeneration, TimeSeriesTransformerForQuestionAnswering): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = TimeSeriesTransformerForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." 
+ else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) + + +TOLERANCE = 1e-4 + + +@require_torch +@require_sentencepiece +@require_tokenizers +@slow +class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): + @cached_property + def default_tokenizer(self): + return TimeSeriesTransformerTokenizer.from_pretrained('huggingface/tst-ett') + + def test_inference_no_head(self): + model = TimeSeriesTransformerModel.from_pretrained('huggingface/tst-ett').to(torch_device) + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]]) + inputs_dict = prepare_time_series_transformer_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict)[0] + expected_shape = torch.Size((1, 11, 1024)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_inference_head(self): + model = TimeSeriesTransformerForConditionalGeneration.from_pretrained('huggingface/tst-ett').to(torch_device) + + # change to intended input + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + inputs_dict = prepare_time_series_transformer_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict)[0] + expected_shape = torch.Size((1, 11, model.config.vocab_size)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_seq_to_seq_generation(self): + hf = TimeSeriesTransformerForConditionalGeneration.from_pretrained('huggingface/tst-ett').to(torch_device) + tok = TimeSeriesTransformerTokenizer.from_pretrained('huggingface/tst-ett') + + batch_input = [ + # string 1, + # string 2, + # string 3, + # string 4, + ] + + # The below article tests that we don't add any hypotheses outside of the top n_beams + dct = tok.batch_encode_plus( + batch_input, + max_length=512, + padding="max_length", + truncation_strategy="only_first", + truncation=True, + return_tensors="pt", + ) + + hypotheses_batch = hf.generate( + input_ids=dct["input_ids"].to(torch_device), + attention_mask=dct["attention_mask"].to(torch_device), + num_beams=2, + ) + + EXPECTED = [ + # here expected 1, + # here expected 2, + # here expected 3, + # here expected 4, + ] + + generated = tok.batch_decode( + hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True + ) + assert generated == EXPECTED + + +class TimeSeriesTransformerStandaloneDecoderModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + d_model=16, + decoder_seq_length=7, + is_training=True, + is_decoder=True, + use_attention_mask=True, + use_cache=False, + use_labels=True, + decoder_start_token_id=2, + decoder_ffn_dim=32, + 
decoder_layers=4, + encoder_attention_heads=4, + decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.d_model = d_model + self.hidden_size = d_model + self.num_hidden_layers = decoder_layers + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.num_attention_heads = decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.use_cache = use_cache + self.max_position_embeddings = max_position_embeddings + self.is_encoder_decoder = is_encoder_decoder + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 2 + self.decoder_attention_idx = 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + config = TimeSeriesTransformerConfig( + vocab_size=self.vocab_size, + d_model=self.d_model, + decoder_layers=self.decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_attention_heads=self.encoder_attention_heads, + decoder_attention_heads=self.decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + use_cache=self.use_cache, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + max_position_embeddings=self.max_position_embeddings, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + attention_mask, + lm_labels, + ) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + config.use_cache = True + model = TimeSeriesTransformerDecoder(config=config).to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + past_key_values = outputs["past_key_values"] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 
0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + model = TimeSeriesTransformerDecoder(config=config).to(torch_device).eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class TimeSeriesTransformerStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (TimeSeriesTransformerDecoder, TimeSeriesTransformerForCausalLM) if is_torch_available() else () + all_generative_model_classes = (TimeSeriesTransformerForCausalLM,) if is_torch_available() else () + test_pruning = False + is_encoder_decoder = False + + def setUp( + self, + ): + self.model_tester = TimeSeriesTransformerStandaloneDecoderModelTester(self, is_training=False) + self.config_tester = ConfigTester(self, config_class=TimeSeriesTransformerConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def test_decoder_model_attn_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # decoder cannot keep gradients + return diff --git a/utils/check_repo.py b/utils/check_repo.py index 9905bb00544b7..2dbcb5aafa9aa 100644 --- a/utils/check_repo.py +++ 
b/utils/check_repo.py @@ -46,6 +46,9 @@ # Being in this list is an exception and should **not** be the rule. IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ # models to ignore for not tested +"TimeSeriesTransformerEncoder", # Building part of bigger (tested) model. + "TimeSeriesTransformerDecoder", # Building part of bigger (tested) model. + "TimeSeriesTransformerDecoderWrapper", # Building part of bigger (tested) model. "OPTDecoder", # Building part of bigger (tested) model. "DecisionTransformerGPT2Model", # Building part of bigger (tested) model. "SegformerDecodeHead", # Building part of bigger (tested) model. @@ -124,6 +127,9 @@ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping +"TimeSeriesTransformerEncoder", + "TimeSeriesTransformerDecoder", + "TimeSeriesTransformerDecoderWrapper", "DPTForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", From cf60b951e83984d44f7932784d537cda38dce892 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 1 Jul 2022 10:37:57 +0200 Subject: [PATCH 003/164] typos --- .../modeling_time_series_transformer.py | 2 +- utils/check_repo.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index caa0644185b9c..ae6dbb5e4ea23 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -232,7 +232,7 @@ def forward( attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) if output_attentions: - # this operation is a bit akward, but it's required to + # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following diff --git a/utils/check_repo.py b/utils/check_repo.py index 2dbcb5aafa9aa..c07312e0678e1 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -46,7 +46,7 @@ # Being in this list is an exception and should **not** be the rule. IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ # models to ignore for not tested -"TimeSeriesTransformerEncoder", # Building part of bigger (tested) model. + "TimeSeriesTransformerEncoder", # Building part of bigger (tested) model. "TimeSeriesTransformerDecoder", # Building part of bigger (tested) model. "TimeSeriesTransformerDecoderWrapper", # Building part of bigger (tested) model. "OPTDecoder", # Building part of bigger (tested) model. @@ -127,7 +127,7 @@ # should **not** be the rule. 
IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping -"TimeSeriesTransformerEncoder", + "TimeSeriesTransformerEncoder", "TimeSeriesTransformerDecoder", "TimeSeriesTransformerDecoderWrapper", "DPTForDepthEstimation", From b18d34557491490c81fe35fc642e3b3d27ab29ed Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 1 Jul 2022 11:44:18 +0200 Subject: [PATCH 004/164] make a start on the model config --- .../configuration_time_series_transformer.py | 98 +++++++------------ 1 file changed, 36 insertions(+), 62 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index ee3f7705b4f90..a0b69187a52ca 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -39,48 +39,34 @@ class TimeSeriesTransformerConfig(PretrainedConfig): Args: - vocab_size (`int`, *optional*, defaults to 50265): - Vocabulary size of the TimeSeriesTransformer model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`~TimeSeriesTransformerModel`] or - [`~TFTimeSeriesTransformerModel`]. - d_model (`int`, *optional*, defaults to 1024): - Dimension of the layers and the pooler layer. - encoder_layers (`int`, *optional*, defaults to 12): + prediction_length (`int`): + The prediction horizon for the model. + context_length (`int`, *optional*, default to `None`): + The context length for the encoder. If `None`, the context length will be the same as the prediction length. + distr_output (`DistributionOutput` default to `StudentTOutput()`): + The distribution emission head for the model. + scaling (`bool` default to `True`): + Whether to scale the input targets. + freq (`str`, *optional* default to `None`): + The frequency of the input time series. If `None`, the `lag_seq` and `time_features` must be provided. + lags_seq (`list` of `int`, *optional* default to `None`): + The lags of the input time series. Cannot be `None` if `freq` is `None`. + time_features (`list` of `TimeFeature`, *optional* default to `None`): + The time features transformations to apply to the input time series. Cannot be `None` if `freq` is `None`. + encoder_layers (`int`, *optional*, defaults to 2): Number of encoder layers. - decoder_layers (`int`, *optional*, defaults to 12): + decoder_layers (`int`, *optional*, defaults to 2): Number of decoder layers. - encoder_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each attention layer in the Transformer decoder. - decoder_ffn_dim (`int`, *optional*, defaults to 4096): - Dimension of the "intermediate" (often named feed-forward) layer in decoder. - encoder_ffn_dim (`int`, *optional*, defaults to 4096): - Dimension of the "intermediate" (often named feed-forward) layer in decoder. + nhead (`int`, *optional*, defaults to 2): + Number of attention heads for each attention layer in the Transformer encoder and decoder. + ffn_dim (`int`, *optional*, defaults to 32): + Dimension of the "intermediate" (often named feed-forward) layer in encoder and decoder. 
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, - `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported. + The non-linear activation function (function or string) in the encoder and decoder. If string, + `"gelu"` and `"relu"` are supported. dropout (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - activation_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for activations inside the fully connected layer. - classifier_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for classifier. - max_position_embeddings (`int`, *optional*, defaults to 1024): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - init_std (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - encoder_layerdrop: (`float`, *optional*, defaults to 0.0): - The LayerDrop probability for the encoder. See the [LayerDrop paper](see - https://arxiv.org/abs/1909.11556) for more details. - decoder_layerdrop: (`float`, *optional*, defaults to 0.0): - The LayerDrop probability for the decoder. See the [LayerDrop paper](see - https://arxiv.org/abs/1909.11556) for more details. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). + The dropout probability for all fully connected layers in the encoder, and decoder. 
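These encoder/decoder hyper-parameters map almost one-to-one onto `torch.nn.Transformer`, which a later commit in this series instantiates. A minimal sketch of that mapping (illustrative only; the sizes below are arbitrary example values, not defaults of this configuration):

```python
# Rough preview, not part of this commit: how the config knobs above feed nn.Transformer.
import torch
from torch import nn

model = nn.Transformer(
    d_model=16,            # derived later from lags and features, not a config field
    nhead=2,
    num_encoder_layers=2,
    num_decoder_layers=2,
    dim_feedforward=32,    # ffn_dim
    dropout=0.1,
    activation="gelu",     # activation_function
    batch_first=True,
)
src = torch.randn(4, 24, 16)  # (batch, context_length, d_model)
tgt = torch.randn(4, 12, 16)  # (batch, prediction_length, d_model)
out = model(src, tgt)         # -> (4, 12, 16)
```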
+ Example: ```python @@ -97,32 +83,25 @@ class TimeSeriesTransformerConfig(PretrainedConfig): ``` """ model_type = "time_series_transformer" - keys_to_ignore_at_inference = ["past_key_values"] + # keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = { - "num_attention_heads": "encoder_attention_heads", - "hidden_size": "d_model" - } + # attribute_map = { + # "num_attention_heads": "encoder_attention_heads", + # "hidden_size": "d_model" + # } def __init__( self, - vocab_size=50265, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, + prediction_length, + context_length=None, + ffn_dim=32, + nhead=2, + freq=None, + encoder_layers=2, + decoder_layers=2, is_encoder_decoder=True, activation_function="gelu", - d_model=1024, dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, init_std=0.02, decoder_start_token_id=2, classifier_dropout=0.0, @@ -149,16 +128,11 @@ def __init__( self.encoder_layerdrop = encoder_layerdrop self.decoder_layerdrop = decoder_layerdrop self.classifier_dropout = classifier_dropout - self.use_cache = use_cache self.num_hidden_layers = encoder_layers self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, **kwargs ) From eb328f5f78d1a129f272f24fcc907db0cbdbcd84 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 1 Jul 2022 13:00:59 +0200 Subject: [PATCH 005/164] ready with configuation --- .../configuration_time_series_transformer.py | 96 ++++++++++--------- 1 file changed, 51 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index a0b69187a52ca..41f71af828c61 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """ TimeSeriesTransformer model configuration """ +from typing import List, Optional + +from gluonts.time_feature import TimeFeature, time_features_from_frequency_str, get_lags_for_frequency +from gluonts.torch.distributions import DistributionOutput, StudentTOutput from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -53,6 +57,16 @@ class TimeSeriesTransformerConfig(PretrainedConfig): The lags of the input time series. Cannot be `None` if `freq` is `None`. time_features (`list` of `TimeFeature`, *optional* default to `None`): The time features transformations to apply to the input time series. Cannot be `None` if `freq` is `None`. + num_feat_dynamic_real (`int`, *optional* default to `0`): + The number of dynamic real valued features. + num_feat_static_cat (`int`, *optional* default to `0`): + The number of static categorical features. + num_feat_static_real (`int`, *optional* default to `0`): + The number of static real valued features. + cardinality (`list` of `int`, *optional* default to `None`): + The cardinality of the categorical features. 
Cannot be `None` if `num_feat_static_cat` is `> 0`. + embedding_dimension (`list` of `int`, *optional* default to `None`): + The dimension of the embedding for the categorical features. Cannot be `None` if `num_feat_static_cat` is `> 0`. encoder_layers (`int`, *optional*, defaults to 2): Number of encoder layers. decoder_layers (`int`, *optional*, defaults to 2): @@ -80,60 +94,52 @@ class TimeSeriesTransformerConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config - ``` -""" + ```""" model_type = "time_series_transformer" - # keys_to_ignore_at_inference = ["past_key_values"] - - # attribute_map = { - # "num_attention_heads": "encoder_attention_heads", - # "hidden_size": "d_model" - # } def __init__( self, prediction_length, - context_length=None, - ffn_dim=32, - nhead=2, - freq=None, - encoder_layers=2, - decoder_layers=2, - is_encoder_decoder=True, - activation_function="gelu", - dropout=0.1, - init_std=0.02, - decoder_start_token_id=2, - classifier_dropout=0.0, - scale_embedding=False, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, + context_length: Optional[int] = None, + freq: Optional[str] = None, + distr_output: DistributionOutput = StudentTOutput(), + lags_seq: Optional[List[int]] = None, + time_features: Optional[List[TimeFeature]] = None, + scaling: bool = True, + num_feat_dynamic_real: int = 0, + num_feat_static_cat: int = 0, + num_feat_static_real: int = 0, + cardinality: Optional[List[int]] = None, + embedding_dimension: Optional[List[int]] = None, + ffn_dim: int = 32, + nhead: int = 2, + encoder_layers: int = 2, + decoder_layers: int = 2, + is_encoder_decoder: bool = True, + activation_function: str = "gelu", + dropout: float = 0.1, + init_std: float = 0.02, **kwargs ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim + self.context_length = context_length or prediction_length + self.prediction_length = prediction_length + self.distr_output = distr_output + self.time_features = time_features or time_features_from_frequency_str(freq) + self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq) + self.scaling = scaling + self.num_feat_dynamic_real = num_feat_dynamic_real + self.num_feat_static_cat = num_feat_static_cat + self.num_feat_static_real = num_feat_static_real + self.cardinality = cardinality if cardinality and num_feat_static_cat > 0 else [1] + self.embedding_dimension = embedding_dimension + + # Transformer architecture parameters + self.nhead = nhead self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads + self.ffn_dim = ffn_dim self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout self.activation_function = activation_function self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.classifier_dropout = classifier_dropout - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - - super().__init__( - is_encoder_decoder=is_encoder_decoder, - **kwargs - ) - - \ No newline at end of file + + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) From 8e664f46456c86583c17f26991a8c67b07e0b98a Mon Sep 17 00:00:00 2001 From: Kashif Rasul 
Date: Fri, 1 Jul 2022 14:53:18 +0200 Subject: [PATCH 006/164] remove tokenizer ref. --- .../time_series_transformer/__init__.py | 27 +++++--------- .../modeling_time_series_transformer.py | 36 +++++++++++-------- 2 files changed, 30 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/time_series_transformer/__init__.py b/src/transformers/models/time_series_transformer/__init__.py index d7ffb0e4ff25a..ec73a76c32cc4 100644 --- a/src/transformers/models/time_series_transformer/__init__.py +++ b/src/transformers/models/time_series_transformer/__init__.py @@ -18,14 +18,15 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from ...utils import _LazyModule, OptionalDependencyNotAvailable, is_tokenizers_available +from ...utils import _LazyModule, OptionalDependencyNotAvailable, is_tokenizers_available from ...utils import is_torch_available - - _import_structure = { - "configuration_time_series_transformer": ["TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimeSeriesTransformerConfig"], + "configuration_time_series_transformer": [ + "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "TimeSeriesTransformerConfig", + ], "tokenization_time_series_transformer": ["TimeSeriesTransformerTokenizer"], } @@ -54,19 +55,11 @@ ] - - if TYPE_CHECKING: - from .configuration_time_series_transformer import TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimeSeriesTransformerConfig - from .tokenization_time_series_transformer import TimeSeriesTransformerTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_time_series_transformer_fast import TimeSeriesTransformerTokenizerFast + from .configuration_time_series_transformer import ( + TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + TimeSeriesTransformerConfig, + ) try: if not is_torch_available(): @@ -84,8 +77,6 @@ TimeSeriesTransformerPreTrainedModel, ) - - else: import sys diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index ae6dbb5e4ea23..156d76a2fe34c 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -39,7 +39,7 @@ Seq2SeqModelOutput, Seq2SeqQuestionAnsweringModelOutput, Seq2SeqSequenceClassifierOutput, - CausalLMOutputWithCrossAttentions + CausalLMOutputWithCrossAttentions, ) from ...modeling_utils import PreTrainedModel from ...utils import logging @@ -89,9 +89,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) -def _expand_mask( - mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None -): +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ @@ -141,7 +139,7 @@ def __init__( assert ( self.head_dim * num_heads == self.embed_dim ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." 
- self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) @@ -313,7 +311,9 @@ def forward( hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) - if hidden_states.dtype == torch.float16 and (torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()): + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): clamp_value = torch.finfo(hidden_states.dtype).max - 1000 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) @@ -480,7 +480,7 @@ def _init_weights(self, module): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - + def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (TimeSeriesTransformerDecoder, TimeSeriesTransformerEncoder)): module.gradient_checkpointing = value @@ -964,7 +964,9 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - attention_mask = self._prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) # expand encoder attention mask if encoder_hidden_states is not None and encoder_attention_mask is not None: @@ -1004,7 +1006,9 @@ def forward( if self.gradient_checkpointing and self.training: if use_cache: - logger.warning("`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`...") + logger.warning( + "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`..." + ) use_cache = False def create_custom_forward(module): @@ -1181,7 +1185,8 @@ def forward( @add_start_docstrings( - "The TimeSeriesTransformer Model with a language modeling head. Can be used for summarization.", TIME_SERIES_TRANSFORMER_START_DOCSTRING + "The TimeSeriesTransformer Model with a language modeling head. 
Can be used for summarization.", + TIME_SERIES_TRANSFORMER_START_DOCSTRING, ) class TimeSeriesTransformerForConditionalGeneration(TimeSeriesTransformerPreTrainedModel): base_model_prefix = "model" @@ -1272,8 +1277,7 @@ def forward( >>> values, predictions = probs.topk(5) >>> tokenizer.decode(predictions).split() - ``` -""" + ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: @@ -1281,7 +1285,9 @@ def forward( logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") use_cache = False if decoder_input_ids is None: - decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) outputs = self.model( input_ids, @@ -1591,6 +1597,7 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) + class TimeSeriesTransformerDecoderWrapper(TimeSeriesTransformerPreTrainedModel): """ This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is @@ -1728,8 +1735,7 @@ def forward( >>> outputs = model(**inputs) >>> logits = outputs.logits - ``` -""" + ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From e078e09e9a710e8e70772be802700b14f68c58f6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 1 Jul 2022 15:45:59 +0200 Subject: [PATCH 007/164] init the transformer --- .../configuration_time_series_transformer.py | 4 +++ .../modeling_time_series_transformer.py | 29 +++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 41f71af828c61..60658aace467e 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -49,6 +49,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig): The context length for the encoder. If `None`, the context length will be the same as the prediction length. distr_output (`DistributionOutput` default to `StudentTOutput()`): The distribution emission head for the model. + input_size (`int` default to 1): + The size of the target variable which by default is 1 for univariate targets. scaling (`bool` default to `True`): Whether to scale the input targets. 
freq (`str`, *optional* default to `None`): @@ -102,6 +104,7 @@ def __init__( prediction_length, context_length: Optional[int] = None, freq: Optional[str] = None, + input_size: int = 1, distr_output: DistributionOutput = StudentTOutput(), lags_seq: Optional[List[int]] = None, time_features: Optional[List[TimeFeature]] = None, @@ -124,6 +127,7 @@ def __init__( self.context_length = context_length or prediction_length self.prediction_length = prediction_length self.distr_output = distr_output + self.input_size = input_size self.time_features = time_features or time_features_from_frequency_str(freq) self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq) self.scaling = scaling diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 156d76a2fe34c..29cd92d107262 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1077,14 +1077,33 @@ def custom_forward(*inputs): TIME_SERIES_TRANSFORMER_START_DOCSTRING, ) class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel): + @property + def _number_of_features(self) -> int: + return ( + sum(self.embedding_dimension) + + self.num_feat_dynamic_real + + 1 + + len(self.time_features) + + max(1, self.num_feat_static_real) + + 1 # the log(scale) + ) + def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) - padding_idx, vocab_size = config.pad_token_id, config.vocab_size - self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) - - self.encoder = TimeSeriesTransformerEncoder(config, self.shared) - self.decoder = TimeSeriesTransformerDecoder(config, self.shared) + self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features + + # transformer enc-decoder and mask initializer + self.transformer = nn.Transformer( + d_model=self.d_model, + nhead=self.nhead, + num_encoder_layers=self.encoder_layers, + num_decoder_layers=self.decoder_layers, + dim_feedforward=self.ffn_dim, + dropout=self.dropout, + activation=self.activation_function, + batch_first=True, + ) # Initialize weights and apply final processing self.post_init() From 972cc89cad9a0f9d9620c7ab25fdb18d6361814a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 1 Jul 2022 17:56:39 +0200 Subject: [PATCH 008/164] added initial model forward to return dec_output --- .../configuration_time_series_transformer.py | 2 +- .../modeling_time_series_transformer.py | 227 ++++++++++++------ 2 files changed, 158 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 60658aace467e..a92ad7255a587 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -135,7 +135,7 @@ def __init__( self.num_feat_static_cat = num_feat_static_cat self.num_feat_static_real = num_feat_static_real self.cardinality = cardinality if cardinality and num_feat_static_cat > 0 else [1] - self.embedding_dimension = embedding_dimension + self.embedding_dimension = embedding_dimension or [min(50, (cat + 1) // 2) for cat in self.cardinality] # Transformer architecture parameters self.nhead = nhead 
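The `embedding_dimension` default added above, together with the `_number_of_features` property and the `d_model` computation introduced in the previous commit, is what ultimately sizes the `nn.Transformer`. A self-contained sketch of that bookkeeping (illustrative only: the cardinalities, lags and feature counts are made-up example values, and the exact terms of the sum are still being adjusted in later commits of this series):

```python
# Illustrative sketch, not part of the patch -- example numbers only.
def example_feature_sizes(
    input_size=1,                    # univariate target
    lags_seq=(1, 2, 3, 4, 5, 6, 7),  # example lag indices
    cardinality=(5, 10),             # two static categorical features
    num_feat_dynamic_real=0,
    num_time_features=2,
    num_feat_static_real=1,
):
    # default rule from the hunk above: one embedding per categorical feature
    embedding_dimension = [min(50, (cat + 1) // 2) for cat in cardinality]
    # per-time-step features concatenated to the lagged targets
    number_of_features = (
        sum(embedding_dimension)
        + num_feat_dynamic_real
        + num_time_features
        + max(1, num_feat_static_real)
        + 1  # the log(scale)
    )
    d_model = input_size * len(lags_seq) + number_of_features
    return embedding_dimension, number_of_features, d_model


print(example_feature_sizes())  # ([3, 5], 12, 19)
```

Because every lag is stacked along the feature axis rather than the time axis, a longer `lags_seq` widens the model instead of lengthening the input sequence.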
diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 29cd92d107262..299afd28f106c 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -24,6 +24,10 @@ from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from gluonts.torch.modules.scaler import MeanScaler, NOPScaler +from gluonts.torch.modules.feature import FeatureEmbedder + + from ...activations import ACT2FN from ...utils import ( add_code_sample_docstrings, @@ -1091,6 +1095,16 @@ def _number_of_features(self) -> int: def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) + if self.scaling: + self.scaler = MeanScaler(dim=1, keepdim=True) + else: + self.scaler = NOPScaler(dim=1, keepdim=True) + + self.embedder = FeatureEmbedder( + cardinalities=self.cardinality, + embedding_dims=self.embedding_dimension, + ) + self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features # transformer enc-decoder and mask initializer @@ -1108,6 +1122,117 @@ def __init__(self, config: TimeSeriesTransformerConfig): # Initialize weights and apply final processing self.post_init() + @property + def _past_length(self) -> int: + return self.context_length + max(self.lags_seq) + + def get_lagged_subsequences( + self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0 + ) -> torch.Tensor: + """ + Returns lagged subsequences of a given sequence. + Parameters + ---------- + sequence : Tensor + the sequence from which lagged subsequences should be extracted. + Shape: (N, T, C). + subsequences_length : int + length of the subsequences to be extracted. + shift: int + shift the lags by this amount back. + Returns + -------- + lagged : Tensor + a tensor of shape (N, S, C, I), where S = subsequences_length and + I = len(indices), containing lagged subsequences. Specifically, + lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :]. + """ + sequence_length = sequence.shape[1] + indices = [lag - shift for lag in self.lags_seq] + + assert max(indices) + subsequences_length <= sequence_length, ( + f"lags cannot go further than history length, found lag {max(indices)} " + f"while history length is only {sequence_length}" + ) + + lagged_values = [] + for lag_index in indices: + begin_index = -lag_index - subsequences_length + end_index = -lag_index if lag_index > 0 else None + lagged_values.append(sequence[:, begin_index:end_index, ...]) + return torch.stack(lagged_values, dim=-1) + + def create_network_inputs( + self, + feat_static_cat: torch.Tensor, + feat_static_real: torch.Tensor, + past_time_feat: torch.Tensor, + past_target: torch.Tensor, + past_observed_values: torch.Tensor, + future_time_feat: Optional[torch.Tensor] = None, + future_target: Optional[torch.Tensor] = None, + ): + # time feature + time_feat = ( + torch.cat( + ( + past_time_feat[:, self._past_length - self.context_length :, ...], + future_time_feat, + ), + dim=1, + ) + if future_target is not None + else past_time_feat[:, self._past_length - self.context_length :, ...] 
+ ) + + # target + context = past_target[:, -self.context_length :] + observed_context = past_observed_values[:, -self.context_length :] + _, scale = self.scaler(context, observed_context) + + inputs = ( + torch.cat((past_target, future_target), dim=1) / scale + if future_target is not None + else past_target / scale + ) + + inputs_length = self._past_length + self.prediction_length if future_target is not None else self._past_length + assert inputs.shape[1] == inputs_length + + subsequences_length = ( + self.context_length + self.prediction_length if future_target is not None else self.context_length + ) + + # embeddings + embedded_cat = self.embedder(feat_static_cat) + static_feat = torch.cat( + (embedded_cat, feat_static_real, scale.log()), + dim=1, + ) + expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1) + + features = torch.cat((expanded_static_feat, time_feat), dim=-1) + + # sequence = torch.cat((prior_input, inputs), dim=1) + lagged_sequence = self.get_lagged_subsequences( + sequence=inputs, + subsequences_length=subsequences_length, + ) + + lags_shape = lagged_sequence.shape + reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1) + + transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1) + + return transformer_inputs, scale, static_feat + + def output_params(self, transformer_inputs): + enc_input = transformer_inputs[:, : self.context_length, ...] + dec_input = transformer_inputs[:, self.context_length :, ...] + + enc_out = self.transformer.encoder(enc_input) + return self.transformer.decoder(dec_input, enc_out, tgt_mask=self.tgt_mask) + def get_input_embeddings(self): return self.shared @@ -1117,10 +1242,10 @@ def set_input_embeddings(self, value): self.decoder.embed_tokens = self.shared def get_encoder(self): - return self.encoder + return self.transformer.encoder def get_decoder(self): - return self.decoder + return self.transformer.decoder @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( @@ -1131,76 +1256,38 @@ def get_decoder(self): ) def forward( self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, + feat_static_cat: torch.Tensor, + feat_static_real: torch.Tensor, + past_time_feat: torch.Tensor, + past_target: torch.Tensor, + past_observed_values: torch.Tensor, + future_time_feat: Optional[torch.Tensor] = None, + future_target: Optional[torch.Tensor] = None, + future_observed_values: Optional[torch.Tensor] = None, ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # If the user passed a 
tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - ) - - # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - past_key_values=past_key_values, - inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if not return_dict: - return decoder_outputs + encoder_outputs - - return Seq2SeqModelOutput( - last_hidden_state=decoder_outputs.last_hidden_state, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, + transformer_inputs, scale, _ = self.create_network_inputs( + feat_static_cat, + feat_static_real, + past_time_feat, + past_target, + past_observed_values, + future_time_feat, + future_target, ) + dec_output = self.output_params(transformer_inputs) + + return dec_output + + # return Seq2SeqModelOutput( + # last_hidden_state=decoder_outputs.last_hidden_state, + # past_key_values=decoder_outputs.past_key_values, + # decoder_hidden_states=decoder_outputs.hidden_states, + # decoder_attentions=decoder_outputs.attentions, + # cross_attentions=decoder_outputs.cross_attentions, + # encoder_last_hidden_state=encoder_outputs.last_hidden_state, + # encoder_hidden_states=encoder_outputs.hidden_states, + # encoder_attentions=encoder_outputs.attentions, + # ) @add_start_docstrings( From cf7460887700f0285256592432ab5493e29da31d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 3 Jul 2022 10:46:47 +0200 Subject: [PATCH 009/164] require gluonts --- setup.cfg | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index 2d605ccceca78..cf559369bfd09 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,6 +15,7 @@ known_third_party = fire fugashi git + gluonts h5py matplotlib nltk diff --git a/setup.py b/setup.py index ea8037d4bbd3a..a1b4940fc97e2 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,7 @@ "ftfy", "fugashi>=1.0", "GitPython<3.1.19", + "gluonts>=0.10.0", "hf-doc-builder>=0.3.0", "huggingface-hub>=0.1.0,<1.0", "importlib_metadata", From c7b8158ca063337026db4c2c35aa64fc646fc628 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 3 Jul 2022 11:15:16 +0200 Subject: [PATCH 010/164] update dep. 
ver table and add as extra --- setup.py | 4 ++++ src/transformers/dependency_versions_table.py | 1 + 2 files changed, 5 insertions(+) diff --git a/setup.py b/setup.py index a1b4940fc97e2..981c74f972aaa 100644 --- a/setup.py +++ b/setup.py @@ -278,6 +278,8 @@ def run(self): extras["vision"] = deps_list("Pillow") extras["timm"] = deps_list("timm") extras["codecarbon"] = deps_list("codecarbon") +extras["glutons"] = deps_list("glutons") + extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["testing"] = ( @@ -320,6 +322,7 @@ def run(self): + extras["timm"] + extras["codecarbon"] + extras["accelerate"] + + extras["glutons"] ) # Might need to add doc-builder and some specific deps in the future @@ -344,6 +347,7 @@ def run(self): + extras["sklearn"] + extras["modelcreation"] + extras["onnxruntime"] + + extras["gluonts"] ) extras["dev-tensorflow"] = ( extras["testing"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index d63b79ababb50..7a9144bc7f860 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -20,6 +20,7 @@ "ftfy": "ftfy", "fugashi": "fugashi>=1.0", "GitPython": "GitPython<3.1.19", + "gluonts": "gluonts>=0.10.0", "hf-doc-builder": "hf-doc-builder>=0.3.0", "huggingface-hub": "huggingface-hub>=0.1.0,<1.0", "importlib_metadata": "importlib_metadata", From dba022c2f130d6f597510b143c7803331ad9a8f5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 3 Jul 2022 11:33:34 +0200 Subject: [PATCH 011/164] fixed typo --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 981c74f972aaa..ceacca5af77c9 100644 --- a/setup.py +++ b/setup.py @@ -278,7 +278,7 @@ def run(self): extras["vision"] = deps_list("Pillow") extras["timm"] = deps_list("timm") extras["codecarbon"] = deps_list("codecarbon") -extras["glutons"] = deps_list("glutons") +extras["gluonts"] = deps_list("gluonts") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") @@ -322,7 +322,7 @@ def run(self): + extras["timm"] + extras["codecarbon"] + extras["accelerate"] - + extras["glutons"] + + extras["gluonts"] ) # Might need to add doc-builder and some specific deps in the future From 29bcc72c64d4d3bea5ed21e59835e1550a322847 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 4 Jul 2022 11:04:31 +0200 Subject: [PATCH 012/164] add type for prediction_length --- .../configuration_time_series_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index a92ad7255a587..b3296c899be43 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -101,7 +101,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): def __init__( self, - prediction_length, + prediction_length: int, context_length: Optional[int] = None, freq: Optional[str] = None, input_size: int = 1, @@ -124,8 +124,8 @@ def __init__( init_std: float = 0.02, **kwargs ): - self.context_length = context_length or prediction_length self.prediction_length = prediction_length + self.context_length = context_length or prediction_length self.distr_output = distr_output self.input_size = input_size self.time_features = time_features or 
time_features_from_frequency_str(freq) From 6f3f627ba9a9c8adfa20243e82dcbecf3745f9b5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 4 Jul 2022 11:25:45 +0200 Subject: [PATCH 013/164] use num_time_features --- .../configuration_time_series_transformer.py | 23 +++++++++---------- .../modeling_time_series_transformer.py | 3 +-- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index b3296c899be43..646d9955347ef 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -15,8 +15,7 @@ """ TimeSeriesTransformer model configuration """ from typing import List, Optional -from gluonts.time_feature import TimeFeature, time_features_from_frequency_str, get_lags_for_frequency -from gluonts.torch.distributions import DistributionOutput, StudentTOutput +from gluonts.time_feature import get_lags_for_frequency from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -57,8 +56,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig): The frequency of the input time series. If `None`, the `lag_seq` and `time_features` must be provided. lags_seq (`list` of `int`, *optional* default to `None`): The lags of the input time series. Cannot be `None` if `freq` is `None`. - time_features (`list` of `TimeFeature`, *optional* default to `None`): - The time features transformations to apply to the input time series. Cannot be `None` if `freq` is `None`. + num_time_features (`int` default to 1): + The number of time features. num_feat_dynamic_real (`int`, *optional* default to `0`): The number of dynamic real valued features. 
num_feat_static_cat (`int`, *optional* default to `0`): @@ -101,13 +100,13 @@ class TimeSeriesTransformerConfig(PretrainedConfig): def __init__( self, - prediction_length: int, - context_length: Optional[int] = None, - freq: Optional[str] = None, + prediction_length: Optional[int] = 24, + freq: Optional[str] = "1D", input_size: int = 1, - distr_output: DistributionOutput = StudentTOutput(), + context_length: Optional[int] = None, + # distr_output: DistributionOutput = StudentTOutput(), lags_seq: Optional[List[int]] = None, - time_features: Optional[List[TimeFeature]] = None, + num_time_features: int = 1, scaling: bool = True, num_feat_dynamic_real: int = 0, num_feat_static_cat: int = 0, @@ -124,15 +123,15 @@ def __init__( init_std: float = 0.02, **kwargs ): + # time series specific parameters self.prediction_length = prediction_length self.context_length = context_length or prediction_length - self.distr_output = distr_output + # self.distr_output = distr_output self.input_size = input_size - self.time_features = time_features or time_features_from_frequency_str(freq) + self.num_time_features = num_time_features self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq) self.scaling = scaling self.num_feat_dynamic_real = num_feat_dynamic_real - self.num_feat_static_cat = num_feat_static_cat self.num_feat_static_real = num_feat_static_real self.cardinality = cardinality if cardinality and num_feat_static_cat > 0 else [1] self.embedding_dimension = embedding_dimension or [min(50, (cat + 1) // 2) for cat in self.cardinality] diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 299afd28f106c..1f666b7e5cbf4 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1086,8 +1086,7 @@ def _number_of_features(self) -> int: return ( sum(self.embedding_dimension) + self.num_feat_dynamic_real - + 1 - + len(self.time_features) + + self.num_time_features + max(1, self.num_feat_static_real) + 1 # the log(scale) ) From 97fbcf49895e4c7f4aff57f0b443c5d4f77f1dc7 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 4 Jul 2022 11:51:07 +0200 Subject: [PATCH 014/164] use config --- .../modeling_time_series_transformer.py | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 1f666b7e5cbf4..ece9ef64b05fa 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1094,14 +1094,14 @@ def _number_of_features(self) -> int: def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) - if self.scaling: + if self.config.scaling: self.scaler = MeanScaler(dim=1, keepdim=True) else: self.scaler = NOPScaler(dim=1, keepdim=True) self.embedder = FeatureEmbedder( - cardinalities=self.cardinality, - embedding_dims=self.embedding_dimension, + cardinalities=self.config.cardinality, + embedding_dims=self.config.embedding_dimension, ) self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features @@ -1109,12 +1109,12 @@ def __init__(self, config: TimeSeriesTransformerConfig): # 
transformer enc-decoder and mask initializer self.transformer = nn.Transformer( d_model=self.d_model, - nhead=self.nhead, - num_encoder_layers=self.encoder_layers, - num_decoder_layers=self.decoder_layers, - dim_feedforward=self.ffn_dim, - dropout=self.dropout, - activation=self.activation_function, + nhead=self.config.nhead, + num_encoder_layers=self.config.encoder_layers, + num_decoder_layers=self.config.decoder_layers, + dim_feedforward=self.config.ffn_dim, + dropout=self.config.config.dropout, + activation=self.config.activation_function, batch_first=True, ) @@ -1123,7 +1123,7 @@ def __init__(self, config: TimeSeriesTransformerConfig): @property def _past_length(self) -> int: - return self.context_length + max(self.lags_seq) + return self.config.context_length + max(self.config.lags_seq) def get_lagged_subsequences( self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0 @@ -1147,7 +1147,7 @@ def get_lagged_subsequences( lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :]. """ sequence_length = sequence.shape[1] - indices = [lag - shift for lag in self.lags_seq] + indices = [lag - shift for lag in self.config.lags_seq] assert max(indices) + subsequences_length <= sequence_length, ( f"lags cannot go further than history length, found lag {max(indices)} " @@ -1175,18 +1175,18 @@ def create_network_inputs( time_feat = ( torch.cat( ( - past_time_feat[:, self._past_length - self.context_length :, ...], + past_time_feat[:, self._past_length - self.config.context_length :, ...], future_time_feat, ), dim=1, ) if future_target is not None - else past_time_feat[:, self._past_length - self.context_length :, ...] + else past_time_feat[:, self._past_length - self.config.context_length :, ...] ) # target - context = past_target[:, -self.context_length :] - observed_context = past_observed_values[:, -self.context_length :] + context = past_target[:, -self.config.context_length :] + observed_context = past_observed_values[:, -self.config.context_length :] _, scale = self.scaler(context, observed_context) inputs = ( @@ -1195,11 +1195,15 @@ def create_network_inputs( else past_target / scale ) - inputs_length = self._past_length + self.prediction_length if future_target is not None else self._past_length + inputs_length = ( + self._past_length + self.config.prediction_length if future_target is not None else self._past_length + ) assert inputs.shape[1] == inputs_length subsequences_length = ( - self.context_length + self.prediction_length if future_target is not None else self.context_length + self.config.context_length + self.confiug.prediction_length + if future_target is not None + else self.config.context_length ) # embeddings @@ -1226,8 +1230,8 @@ def create_network_inputs( return transformer_inputs, scale, static_feat def output_params(self, transformer_inputs): - enc_input = transformer_inputs[:, : self.context_length, ...] - dec_input = transformer_inputs[:, self.context_length :, ...] + enc_input = transformer_inputs[:, : self.config.context_length, ...] + dec_input = transformer_inputs[:, self.config.context_length :, ...] 
enc_out = self.transformer.encoder(enc_input) return self.transformer.decoder(dec_input, enc_out, tgt_mask=self.tgt_mask) From 62286aab47a68125f67dfb19ad4b6eb5195c37bc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 4 Jul 2022 11:53:51 +0200 Subject: [PATCH 015/164] more config --- .../time_series_transformer/modeling_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index ece9ef64b05fa..5f114333c8eb2 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1104,7 +1104,7 @@ def __init__(self, config: TimeSeriesTransformerConfig): embedding_dims=self.config.embedding_dimension, ) - self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features + self.d_model = self.config.input_size * len(self.config.lags_seq) + self._number_of_features # transformer enc-decoder and mask initializer self.transformer = nn.Transformer( From 88e071bede0e5f6e5d4afa3bb9c528adfca6d6fd Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 4 Jul 2022 11:59:55 +0200 Subject: [PATCH 016/164] typos --- .../modeling_time_series_transformer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 5f114333c8eb2..25562c7f43710 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1084,10 +1084,10 @@ class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel): @property def _number_of_features(self) -> int: return ( - sum(self.embedding_dimension) - + self.num_feat_dynamic_real - + self.num_time_features - + max(1, self.num_feat_static_real) + sum(self.config.embedding_dimension) + + self.config.num_feat_dynamic_real + + self.config.num_time_features + + self.config.num_feat_static_real + 1 # the log(scale) ) @@ -1113,7 +1113,7 @@ def __init__(self, config: TimeSeriesTransformerConfig): num_encoder_layers=self.config.encoder_layers, num_decoder_layers=self.config.decoder_layers, dim_feedforward=self.config.ffn_dim, - dropout=self.config.config.dropout, + dropout=self.config.dropout, activation=self.config.activation_function, batch_first=True, ) From 3e01a69a9bce8ce032e1c6207764a497f0497bd8 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 4 Jul 2022 12:17:53 +0200 Subject: [PATCH 017/164] opps another typo --- .../time_series_transformer/modeling_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 25562c7f43710..982a1f6f94b77 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1201,7 +1201,7 @@ def create_network_inputs( assert inputs.shape[1] == inputs_length subsequences_length = ( - self.config.context_length + self.confiug.prediction_length + 
self.config.context_length + self.config.prediction_length if future_target is not None else self.config.context_length ) From 202dab066e4193abc637ba70d2388f954b4f5007 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 5 Jul 2022 13:45:05 +0200 Subject: [PATCH 018/164] freq can be none --- .../configuration_time_series_transformer.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 646d9955347ef..2408b4698f903 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -53,11 +53,11 @@ class TimeSeriesTransformerConfig(PretrainedConfig): scaling (`bool` default to `True`): Whether to scale the input targets. freq (`str`, *optional* default to `None`): - The frequency of the input time series. If `None`, the `lag_seq` and `time_features` must be provided. + The frequency of the input time series. If `None`, the `lags_seq` and `time_features` must be provided. lags_seq (`list` of `int`, *optional* default to `None`): The lags of the input time series. Cannot be `None` if `freq` is `None`. - num_time_features (`int` default to 1): - The number of time features. + num_time_features (`int` default to 7): + The number of time features which is 7 when no `freq` is specified. num_feat_dynamic_real (`int`, *optional* default to `0`): The number of dynamic real valued features. num_feat_static_cat (`int`, *optional* default to `0`): @@ -100,13 +100,13 @@ class TimeSeriesTransformerConfig(PretrainedConfig): def __init__( self, - prediction_length: Optional[int] = 24, - freq: Optional[str] = "1D", input_size: int = 1, + freq: Optional[str] = None, + prediction_length: Optional[int] = None, context_length: Optional[int] = None, # distr_output: DistributionOutput = StudentTOutput(), lags_seq: Optional[List[int]] = None, - num_time_features: int = 1, + num_time_features: int = 7, scaling: bool = True, num_feat_dynamic_real: int = 0, num_feat_static_cat: int = 0, @@ -126,13 +126,15 @@ def __init__( # time series specific parameters self.prediction_length = prediction_length self.context_length = context_length or prediction_length + self.freq = freq # self.distr_output = distr_output self.input_size = input_size self.num_time_features = num_time_features - self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq) + self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq or "1S") self.scaling = scaling self.num_feat_dynamic_real = num_feat_dynamic_real self.num_feat_static_real = num_feat_static_real + self.num_feat_static_cat = num_feat_static_cat self.cardinality = cardinality if cardinality and num_feat_static_cat > 0 else [1] self.embedding_dimension = embedding_dimension or [min(50, (cat + 1) // 2) for cat in self.cardinality] From 489d7f15142f4a4718a3af1cb436c12ae8c932ef Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 5 Jul 2022 14:01:30 +0200 Subject: [PATCH 019/164] default via transformation is 1 --- .../configuration_time_series_transformer.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 
2408b4698f903..e28213f42a042 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -15,7 +15,7 @@ """ TimeSeriesTransformer model configuration """ from typing import List, Optional -from gluonts.time_feature import get_lags_for_frequency +from gluonts.time_feature import get_lags_for_frequency, time_features_from_frequency_str from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -53,11 +53,9 @@ class TimeSeriesTransformerConfig(PretrainedConfig): scaling (`bool` default to `True`): Whether to scale the input targets. freq (`str`, *optional* default to `None`): - The frequency of the input time series. If `None`, the `lags_seq` and `time_features` must be provided. + The frequency of the input time series. If `None`, the `lags_seq` and `num_time_features` are set at the finest temporal resolution of 1 Second. lags_seq (`list` of `int`, *optional* default to `None`): - The lags of the input time series. Cannot be `None` if `freq` is `None`. - num_time_features (`int` default to 7): - The number of time features which is 7 when no `freq` is specified. + The lags of the input time series. If `None`, the `freq` is used to determine the lags. num_feat_dynamic_real (`int`, *optional* default to `0`): The number of dynamic real valued features. num_feat_static_cat (`int`, *optional* default to `0`): @@ -104,9 +102,8 @@ def __init__( freq: Optional[str] = None, prediction_length: Optional[int] = None, context_length: Optional[int] = None, - # distr_output: DistributionOutput = StudentTOutput(), + # TODO distr_output: DistributionOutput = StudentTOutput(), lags_seq: Optional[List[int]] = None, - num_time_features: int = 7, scaling: bool = True, num_feat_dynamic_real: int = 0, num_feat_static_cat: int = 0, @@ -126,15 +123,17 @@ def __init__( # time series specific parameters self.prediction_length = prediction_length self.context_length = context_length or prediction_length - self.freq = freq + self.freq = freq or "1S" # self.distr_output = distr_output self.input_size = input_size - self.num_time_features = num_time_features - self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq or "1S") + self.num_time_features = ( + len(time_features_from_frequency_str(freq_str=self.freq)) + 1 + ) # +1 for the Age feature + self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=self.freq) self.scaling = scaling self.num_feat_dynamic_real = num_feat_dynamic_real - self.num_feat_static_real = num_feat_static_real - self.num_feat_static_cat = num_feat_static_cat + self.num_feat_static_real = max(1, num_feat_static_real) + self.num_feat_static_cat = max(1, num_feat_static_cat) self.cardinality = cardinality if cardinality and num_feat_static_cat > 0 else [1] self.embedding_dimension = embedding_dimension or [min(50, (cat + 1) // 2) for cat in self.cardinality] From d5f6eb3d1dc71f352634650ad82e50a5fdb1b70e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 5 Jul 2022 15:36:26 +0200 Subject: [PATCH 020/164] initial transformations --- .../configuration_time_series_transformer.py | 1 + .../modeling_time_series_transformer.py | 1 - .../time_series_transformations.py | 134 ++++++++++++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 src/transformers/models/time_series_transformer/time_series_transformations.py diff --git 
a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index e28213f42a042..51da6441be7b4 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ TimeSeriesTransformer model configuration """ + from typing import List, Optional from gluonts.time_feature import get_lags_for_frequency, time_features_from_frequency_str diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 982a1f6f94b77..0f2559a020fb5 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -14,7 +14,6 @@ # limitations under the License. """ PyTorch TimeSeriesTransformer model. """ - import math import copy import random diff --git a/src/transformers/models/time_series_transformer/time_series_transformations.py b/src/transformers/models/time_series_transformer/time_series_transformations.py new file mode 100644 index 0000000000000..9ea3dbfa75734 --- /dev/null +++ b/src/transformers/models/time_series_transformer/time_series_transformations.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright 2022 kashif and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Transformations for Time Series Transformers.""" + +from typing import Optional, Tuple, List, Union + +import pandas as pd + +from gluonts.time_feature import time_features_from_frequency_str, TimeFeature +from gluonts.dataset.field_names import FieldName +from gluonts.transform import ( + AddAgeFeature, + AddObservedValuesIndicator, + AddTimeFeatures, + AsNumpyArray, + Chain, + ExpectedNumInstanceSampler, + InstanceSplitter, + RemoveFields, + SetField, + TestSplitSampler, + Transformation, + ValidationSplitSampler, + VstackFeatures, +) +from gluonts.transform.sampler import InstanceSampler + + +@lru_cache(10_000) +def _as_period(val, freq): + return pd.Period(val, freq) + + +# TODO +def process_start_field(freq): + # FieldName.START: _as_period(data[FieldName.START], self.freq), + pass + + +def create_transformation(config) -> Transformation: + remove_field_names = [] + if config.num_feat_static_real == 0: + remove_field_names.append(FieldName.FEAT_STATIC_REAL) + if config.num_feat_dynamic_real == 0: + remove_field_names.append(FieldName.FEAT_DYNAMIC_REAL) + + return Chain( + [RemoveFields(field_names=remove_field_names)] + + ([SetField(output_field=FieldName.FEAT_STATIC_CAT, value=[0])] if not config.num_feat_static_cat > 0 else []) + + ( + [SetField(output_field=FieldName.FEAT_STATIC_REAL, value=[0.0])] + if not config.num_feat_static_real > 0 + else [] + ) + + [ + AsNumpyArray( + field=FieldName.FEAT_STATIC_CAT, + expected_ndim=1, + dtype=int, + ), + AsNumpyArray( + field=FieldName.FEAT_STATIC_REAL, + expected_ndim=1, + ), + AsNumpyArray( + field=FieldName.TARGET, + # in the following line, we add 1 for the time dimension + expected_ndim=config.input_size, + ), + AddObservedValuesIndicator( + target_field=FieldName.TARGET, + output_field=FieldName.OBSERVED_VALUES, + ), + AddTimeFeatures( + start_field=FieldName.START, + target_field=FieldName.TARGET, + output_field=FieldName.FEAT_TIME, + time_features=time_features_from_frequency_str(config.freq), + pred_length=config.prediction_length, + ), + AddAgeFeature( + target_field=FieldName.TARGET, + output_field=FieldName.FEAT_AGE, + pred_length=config.prediction_length, + log_scale=True, + ), + VstackFeatures( + output_field=FieldName.FEAT_TIME, + input_fields=[FieldName.FEAT_TIME, FieldName.FEAT_AGE] + + ([FieldName.FEAT_DYNAMIC_REAL] if config.num_feat_dynamic_real > 0 else []), + ), + ] + ) + + +def create_instance_splitter( + config, + mode: str, + train_sampler: Optional[InstanceSampler] = None, + validation_sampler: Optional[InstanceSampler] = None, +) -> Transformation: + assert mode in ["train", "validation", "test"] + + instance_sampler = { + "train": train_sampler or ExpectedNumInstanceSampler(num_instances=1.0, min_future=config.prediction_length), + "validation": validation_sampler or ValidationSplitSampler(min_future=config.prediction_length), + "test": TestSplitSampler(), + }[mode] + + return InstanceSplitter( + target_field=FieldName.TARGET, + is_pad_field=FieldName.IS_PAD, + start_field=FieldName.START, + forecast_start_field=FieldName.FORECAST_START, + instance_sampler=instance_sampler, + past_length=config.context_length + max(config.lags_seq), + future_length=config.prediction_length, + time_series_fields=[ + FieldName.FEAT_TIME, + FieldName.OBSERVED_VALUES, + ], + ) From 302a387a26899064bc21138e52b371599733f73d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 5 Jul 2022 15:38:47 +0200 Subject: [PATCH 021/164] fix imports --- .../modeling_time_series_transformer.py | 2 +- 
.../time_series_transformer/time_series_transformations.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 0f2559a020fb5..6e95fbb692204 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -17,7 +17,7 @@ import math import copy import random -from typing import Optional, Tuple, List, Union +from typing import Optional, Tuple, List import torch from torch import nn diff --git a/src/transformers/models/time_series_transformer/time_series_transformations.py b/src/transformers/models/time_series_transformer/time_series_transformations.py index 9ea3dbfa75734..2e409dc43b2f5 100644 --- a/src/transformers/models/time_series_transformer/time_series_transformations.py +++ b/src/transformers/models/time_series_transformer/time_series_transformations.py @@ -14,7 +14,8 @@ # limitations under the License. """Transformations for Time Series Transformers.""" -from typing import Optional, Tuple, List, Union +from typing import Optional, List +from functools import lru_cache import pandas as pd From 669aeeb21fcbbc83c1bf43a60e7250cc6e797ef3 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 5 Jul 2022 17:54:54 +0200 Subject: [PATCH 022/164] added transform_start_field --- .../time_series_transformer/time_series_transformations.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/time_series_transformations.py b/src/transformers/models/time_series_transformer/time_series_transformations.py index 2e409dc43b2f5..102371f199272 100644 --- a/src/transformers/models/time_series_transformer/time_series_transformations.py +++ b/src/transformers/models/time_series_transformer/time_series_transformations.py @@ -44,10 +44,9 @@ def _as_period(val, freq): return pd.Period(val, freq) -# TODO -def process_start_field(freq): - # FieldName.START: _as_period(data[FieldName.START], self.freq), - pass +def transform_start_field(batch, freq): + batch[FieldName.START] = [_as_period(entry, freq) for entry in batch[FieldName.START]] + return batch def create_transformation(config) -> Transformation: From 6c339d888333a107d04edc3d6c04149d36c5c3ad Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 6 Jul 2022 18:02:00 +0200 Subject: [PATCH 023/164] add helper to create pytorch dataloader --- .../time_series_transformations.py | 60 +++++++++++++++++-- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/time_series_transformations.py b/src/transformers/models/time_series_transformer/time_series_transformations.py index 102371f199272..c0cd5d8ef4a61 100644 --- a/src/transformers/models/time_series_transformer/time_series_transformations.py +++ b/src/transformers/models/time_series_transformer/time_series_transformations.py @@ -12,14 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformations for Time Series Transformers.""" +""" Transformations for Time Series Transformers. 
""" -from typing import Optional, List +from typing import Optional, List, Iterable from functools import lru_cache import pandas as pd -from gluonts.time_feature import time_features_from_frequency_str, TimeFeature +from torch.utils.data import DataLoader + +from gluonts.time_feature import time_features_from_frequency_str from gluonts.dataset.field_names import FieldName from gluonts.transform import ( AddAgeFeature, @@ -30,6 +32,7 @@ ExpectedNumInstanceSampler, InstanceSplitter, RemoveFields, + SelectFields, SetField, TestSplitSampler, Transformation, @@ -37,6 +40,8 @@ VstackFeatures, ) from gluonts.transform.sampler import InstanceSampler +from gluonts.itertools import Cyclic, IterableSlice, PseudoShuffled +from gluonts.torch.util import IterableDataset @lru_cache(10_000) @@ -76,7 +81,6 @@ def create_transformation(config) -> Transformation: ), AsNumpyArray( field=FieldName.TARGET, - # in the following line, we add 1 for the time dimension expected_ndim=config.input_size, ), AddObservedValuesIndicator( @@ -132,3 +136,51 @@ def create_instance_splitter( FieldName.OBSERVED_VALUES, ], ) + + +def create_training_data_loader( + config, + data, + batch_size: int, + num_batches_per_epoch: int, + shuffle_buffer_length: Optional[int] = None, + **kwargs, +) -> Iterable: + PREDICTION_INPUT_NAMES = [ + FieldName.FEAT_STATIC_CAT, + FieldName.FEAT_STATIC_REAL, + "past_" + FieldName.FEAT_TIME, + "past_" + FieldName.TARGET, + "past_" + FieldName.OBSERVED_VALUES, + "future_" + FieldName.FEAT_TIME, + ] + + TRAINING_INPUT_NAMES = PREDICTION_INPUT_NAMES + [ + "future_" + FieldName.TARGET, + "future_" + FieldName.OBSERVED_VALUES, + ] + + transformation = create_transformation(config) + transformed_data = transformation.apply(data, is_train=True) + + instance_splitter = create_instance_splitter(config, "train") + SelectFields(TRAINING_INPUT_NAMES) + + training_instances = instance_splitter.apply( + Cyclic(transformed_data) + if shuffle_buffer_length is None + else PseudoShuffled( + Cyclic(transformed_data), + shuffle_buffer_length=shuffle_buffer_length, + ) + ) + + return IterableSlice( + iter( + DataLoader( + IterableDataset(training_instances), + batch_size=batch_size, + **kwargs, + ) + ), + num_batches_per_epoch, + ) From 43ffcf19a6fb06728e6889b0d1b6bc45deb22b8d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 7 Jul 2022 10:24:24 +0200 Subject: [PATCH 024/164] added inital val and test data loader --- .../configuration_time_series_transformer.py | 5 +- .../time_series_transformations.py | 61 ++++++++++++++++++- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 51da6441be7b4..96551b9e58697 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" TimeSeriesTransformer model configuration """ - from typing import List, Optional from gluonts.time_feature import get_lags_for_frequency, time_features_from_frequency_str @@ -133,8 +132,8 @@ def __init__( self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=self.freq) self.scaling = scaling self.num_feat_dynamic_real = num_feat_dynamic_real - self.num_feat_static_real = max(1, num_feat_static_real) - self.num_feat_static_cat = max(1, num_feat_static_cat) + self.num_feat_static_real = num_feat_static_real # there is at least one dummy static real feature + self.num_feat_static_cat = num_feat_static_cat # there is at least one dummy static categorical feature self.cardinality = cardinality if cardinality and num_feat_static_cat > 0 else [1] self.embedding_dimension = embedding_dimension or [min(50, (cat + 1) // 2) for cat in self.cardinality] diff --git a/src/transformers/models/time_series_transformer/time_series_transformations.py b/src/transformers/models/time_series_transformer/time_series_transformations.py index c0cd5d8ef4a61..b3486485e880c 100644 --- a/src/transformers/models/time_series_transformer/time_series_transformations.py +++ b/src/transformers/models/time_series_transformer/time_series_transformations.py @@ -171,7 +171,8 @@ def create_training_data_loader( else PseudoShuffled( Cyclic(transformed_data), shuffle_buffer_length=shuffle_buffer_length, - ) + ), + is_train=True, ) return IterableSlice( @@ -184,3 +185,61 @@ def create_training_data_loader( ), num_batches_per_epoch, ) + + +def create_validation_data_loader( + config, + data, + batch_size, + **kwargs, +): + PREDICTION_INPUT_NAMES = [ + FieldName.FEAT_STATIC_CAT, + FieldName.FEAT_STATIC_REAL, + "past_" + FieldName.FEAT_TIME, + "past_" + FieldName.TARGET, + "past_" + FieldName.OBSERVED_VALUES, + "future_" + FieldName.FEAT_TIME, + ] + + TRAINING_INPUT_NAMES = PREDICTION_INPUT_NAMES + [ + "future_" + FieldName.TARGET, + "future_" + FieldName.OBSERVED_VALUES, + ] + transformation = create_transformation(config) + transformed_data = transformation.apply(data, is_train=True) + + instance_splitter = create_instance_splitter(config, "validation") + SelectFields(TRAINING_INPUT_NAMES) + validation_instances = instance_splitter.apply(transformed_data, is_train=True) + + return DataLoader( + IterableDataset(validation_instances), + batch_size=batch_size, + **kwargs, + ) + + +def create_test_data_loader( + config, + data, + batch_size, + **kwargs, +): + PREDICTION_INPUT_NAMES = [ + FieldName.FEAT_STATIC_CAT, + FieldName.FEAT_STATIC_REAL, + "past_" + FieldName.FEAT_TIME, + "past_" + FieldName.TARGET, + "past_" + FieldName.OBSERVED_VALUES, + "future_" + FieldName.FEAT_TIME, + ] + transformation = create_transformation(config) + transformed_data = transformation.apply(data, is_train=False) + instance_splitter = create_instance_splitter(config, "test") + SelectFields(PREDICTION_INPUT_NAMES) + test_instances = instance_splitter.apply(transformed_data, is_tran=False) + + return DataLoader( + IterableDataset(test_instances), + batch_size=batch_size, + **kwargs, + ) From 1525a1a07ba53d5ba408e141562c0ee54ff07766 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 7 Jul 2022 11:33:12 +0200 Subject: [PATCH 025/164] added initial distr head and loss --- .../configuration_time_series_transformer.py | 10 ++- .../modeling_time_series_transformer.py | 61 ++++++++++++++++++- 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py 
b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 96551b9e58697..16a4fd57b19a4 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -46,8 +46,10 @@ class TimeSeriesTransformerConfig(PretrainedConfig): The prediction horizon for the model. context_length (`int`, *optional*, default to `None`): The context length for the encoder. If `None`, the context length will be the same as the prediction length. - distr_output (`DistributionOutput` default to `StudentTOutput()`): + distr_output (`string` default to `StudentT`): The distribution emission head for the model. + loss (`string` default to `NLL`): + The loss function for the model with corresponding to the `distr_output` head. input_size (`int` default to 1): The size of the target variable which by default is 1 for univariate targets. scaling (`bool` default to `True`): @@ -102,7 +104,8 @@ def __init__( freq: Optional[str] = None, prediction_length: Optional[int] = None, context_length: Optional[int] = None, - # TODO distr_output: DistributionOutput = StudentTOutput(), + distr_output: str = "StudentT", + loss: str = "NLL", lags_seq: Optional[List[int]] = None, scaling: bool = True, num_feat_dynamic_real: int = 0, @@ -124,7 +127,8 @@ def __init__( self.prediction_length = prediction_length self.context_length = context_length or prediction_length self.freq = freq or "1S" - # self.distr_output = distr_output + self.distr_output = distr_output + self.loss = loss self.input_size = input_size self.num_time_features = ( len(time_features_from_frequency_str(freq_str=self.freq)) + 1 diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 6e95fbb692204..6a50bab1d40d7 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -25,6 +25,9 @@ from gluonts.torch.modules.scaler import MeanScaler, NOPScaler from gluonts.torch.modules.feature import FeatureEmbedder +from gluonts.torch.distributions import StudentTOutput +from gluonts.torch.modules.loss import NegativeLogLikelihood +from gluonts.torch.util import weighted_average from ...activations import ACT2FN @@ -1265,7 +1268,6 @@ def forward( past_observed_values: torch.Tensor, future_time_feat: Optional[torch.Tensor] = None, future_target: Optional[torch.Tensor] = None, - future_observed_values: Optional[torch.Tensor] = None, ): transformer_inputs, scale, _ = self.create_network_inputs( feat_static_cat, @@ -1278,7 +1280,7 @@ def forward( ) dec_output = self.output_params(transformer_inputs) - return dec_output + return dec_output, scale # return Seq2SeqModelOutput( # last_hidden_state=decoder_outputs.last_hidden_state, @@ -1292,6 +1294,61 @@ def forward( # ) +class TimeSeriesTransformerModelForPrediction(TimeSeriesTransformerModel): + def __init__(self, config: TimeSeriesTransformerConfig): + super().__init__(config) + self.config = config + self.transformer = TimeSeriesTransformerModel(config) + if config.distr_output == "StudentT": + self.distr_output = StudentTOutput() + self.param_proj = self.distr_output.get_args_proj(self.transformer.d_model) + self.target_shape = self.distr_output.event_shape + + if config.loss == "NLL": + self.loss = 
NegativeLogLikelihood() + + def output_params(self, dec_output): + return self.param_proj(dec_output) + + @torch.jit.ignore + def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution: + sliced_params = params + if trailing_n is not None: + sliced_params = [p[:, -trailing_n:] for p in params] + return self.distr_output.distribution(sliced_params, scale=scale) + + def forward(self, batch): + feat_static_cat = batch["feat_static_cat"] + feat_static_real = batch["feat_static_real"] + past_time_feat = batch["past_time_feat"] + past_target = batch["past_target"] + future_time_feat = batch["future_time_feat"] + future_target = batch["future_target"] + past_observed_values = batch["past_observed_values"] + future_observed_values = batch["future_observed_values"] + + dec_output, scale = self.transformer( + feat_static_cat, + feat_static_real, + past_time_feat, + past_target, + past_observed_values, + future_time_feat, + future_target, + ) + params = self.output_params(dec_output) + distr = self.output_distribution(params, scale) + + loss_values = self.loss(distr, future_target) + + if len(self.target_shape) == 0: + loss_weights = future_observed_values + else: + loss_weights = future_observed_values.min(dim=-1, keepdim=False) + + return weighted_average(loss_values, weights=loss_weights) + + @add_start_docstrings( "The TimeSeriesTransformer Model with a language modeling head. Can be used for summarization.", TIME_SERIES_TRANSFORMER_START_DOCSTRING, From 3fa6d48f4c7d6c67186b8f15076e6c0e8dd011c0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 7 Jul 2022 20:35:59 +0200 Subject: [PATCH 026/164] training working --- src/transformers/__init__.py | 8 ++++- .../time_series_transformer/__init__.py | 2 ++ .../configuration_time_series_transformer.py | 4 +-- .../modeling_time_series_transformer.py | 12 +++++-- .../test_modeling_time_series_transformer.py | 32 +++++++++++++------ 5 files changed, 43 insertions(+), 15 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4840b8f120b53..6d68dfa6efe8d 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -308,7 +308,11 @@ "models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"], "models.tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig", "TapasTokenizer"], "models.tapex": ["TapexTokenizer"], - "models.time_series_transformer": ["TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimeSeriesTransformerConfig", "TimeSeriesTransformerTokenizer"], + "models.time_series_transformer": [ + "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "TimeSeriesTransformerConfig", + "TimeSeriesTransformerTokenizer", + ], "models.trajectory_transformer": [ "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TrajectoryTransformerConfig", @@ -760,6 +764,7 @@ "TimeSeriesTransformerForConditionalGeneration", "TimeSeriesTransformerForQuestionAnswering", "TimeSeriesTransformerForSequenceClassification", + "TimeSeriesTransformerForPrediction", "TimeSeriesTransformerModel", "TimeSeriesTransformerPreTrainedModel", ] @@ -4224,6 +4229,7 @@ TimeSeriesTransformerForConditionalGeneration, TimeSeriesTransformerForQuestionAnswering, TimeSeriesTransformerForSequenceClassification, + TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel, TimeSeriesTransformerPreTrainedModel, ) diff --git a/src/transformers/models/time_series_transformer/__init__.py b/src/transformers/models/time_series_transformer/__init__.py index 
ec73a76c32cc4..0eb64b484e854 100644 --- a/src/transformers/models/time_series_transformer/__init__.py +++ b/src/transformers/models/time_series_transformer/__init__.py @@ -50,6 +50,7 @@ "TimeSeriesTransformerForQuestionAnswering", "TimeSeriesTransformerForSequenceClassification", "TimeSeriesTransformerForCausalLM", + "TimeSeriesTransformerForPrediction", "TimeSeriesTransformerModel", "TimeSeriesTransformerPreTrainedModel", ] @@ -73,6 +74,7 @@ TimeSeriesTransformerForCausalLM, TimeSeriesTransformerForQuestionAnswering, TimeSeriesTransformerForSequenceClassification, + TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel, TimeSeriesTransformerPreTrainedModel, ) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 16a4fd57b19a4..ae66bb023e15e 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -136,8 +136,8 @@ def __init__( self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=self.freq) self.scaling = scaling self.num_feat_dynamic_real = num_feat_dynamic_real - self.num_feat_static_real = num_feat_static_real # there is at least one dummy static real feature - self.num_feat_static_cat = num_feat_static_cat # there is at least one dummy static categorical feature + self.num_feat_static_real = num_feat_static_real + self.num_feat_static_cat = num_feat_static_cat self.cardinality = cardinality if cardinality and num_feat_static_cat > 0 else [1] self.embedding_dimension = embedding_dimension or [min(50, (cat + 1) // 2) for cat in self.cardinality] diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 6a50bab1d40d7..5cba4bd827a5e 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1089,7 +1089,7 @@ def _number_of_features(self) -> int: sum(self.config.embedding_dimension) + self.config.num_feat_dynamic_real + self.config.num_time_features - + self.config.num_feat_static_real + + max(1, self.config.num_feat_static_real) # there is at least one dummy static real feature + 1 # the log(scale) ) @@ -1120,6 +1120,12 @@ def __init__(self, config: TimeSeriesTransformerConfig): batch_first=True, ) + # causal decoder tgt mask + self.register_buffer( + "tgt_mask", + self.transformer.generate_square_subsequent_mask(self.config.prediction_length), + ) + # Initialize weights and apply final processing self.post_init() @@ -1294,7 +1300,7 @@ def forward( # ) -class TimeSeriesTransformerModelForPrediction(TimeSeriesTransformerModel): +class TimeSeriesTransformerForPrediction(TimeSeriesTransformerModel): def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) self.config = config @@ -1345,7 +1351,7 @@ def forward(self, batch): loss_weights = future_observed_values else: loss_weights = future_observed_values.min(dim=-1, keepdim=False) - + return weighted_average(loss_values, weights=loss_weights) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 6172a1f2ae44a..410ee4101f526 100644 --- 
a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -37,6 +37,7 @@ TimeSeriesTransformerForQuestionAnswering, TimeSeriesTransformerForCausalLM, TimeSeriesTransformerForSequenceClassification, + TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel, TimeSeriesTransformerTokenizer, ) @@ -156,7 +157,9 @@ def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] # select random slice random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() @@ -204,7 +207,12 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict): @require_torch class TimeSeriesTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( - (TimeSeriesTransformerModel, TimeSeriesTransformerForConditionalGeneration, TimeSeriesTransformerForSequenceClassification, TimeSeriesTransformerForQuestionAnswering) + ( + TimeSeriesTransformerModel, + TimeSeriesTransformerForConditionalGeneration, + TimeSeriesTransformerForSequenceClassification, + TimeSeriesTransformerForQuestionAnswering, + ) if is_torch_available() else () ) @@ -243,7 +251,11 @@ def test_encoder_decoder_model_standalone(self): def test_inputs_embeds(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in (TimeSeriesTransformerModel, TimeSeriesTransformerForConditionalGeneration, TimeSeriesTransformerForQuestionAnswering): + for model_class in ( + TimeSeriesTransformerModel, + TimeSeriesTransformerForConditionalGeneration, + TimeSeriesTransformerForQuestionAnswering, + ): model = model_class(config) model.to(torch_device) model.eval() @@ -313,10 +325,10 @@ def _long_tensor(tok_lst): class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): @cached_property def default_tokenizer(self): - return TimeSeriesTransformerTokenizer.from_pretrained('huggingface/tst-ett') + return TimeSeriesTransformerTokenizer.from_pretrained("huggingface/tst-ett") def test_inference_no_head(self): - model = TimeSeriesTransformerModel.from_pretrained('huggingface/tst-ett').to(torch_device) + model = TimeSeriesTransformerModel.from_pretrained("huggingface/tst-ett").to(torch_device) input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]]) inputs_dict = prepare_time_series_transformer_inputs_dict(model.config, input_ids, decoder_input_ids) @@ -331,7 +343,7 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) def test_inference_head(self): - model = TimeSeriesTransformerForConditionalGeneration.from_pretrained('huggingface/tst-ett').to(torch_device) + model = TimeSeriesTransformerForConditionalGeneration.from_pretrained("huggingface/tst-ett").to(torch_device) # change to intended input input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) @@ -348,8 +360,8 @@ def 
test_inference_head(self): self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) def test_seq_to_seq_generation(self): - hf = TimeSeriesTransformerForConditionalGeneration.from_pretrained('huggingface/tst-ett').to(torch_device) - tok = TimeSeriesTransformerTokenizer.from_pretrained('huggingface/tst-ett') + hf = TimeSeriesTransformerForConditionalGeneration.from_pretrained("huggingface/tst-ett").to(torch_device) + tok = TimeSeriesTransformerTokenizer.from_pretrained("huggingface/tst-ett") batch_input = [ # string 1, @@ -576,7 +588,9 @@ def prepare_config_and_inputs_for_common(self): @require_torch class TimeSeriesTransformerStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (TimeSeriesTransformerDecoder, TimeSeriesTransformerForCausalLM) if is_torch_available() else () + all_model_classes = ( + (TimeSeriesTransformerDecoder, TimeSeriesTransformerForCausalLM) if is_torch_available() else () + ) all_generative_model_classes = (TimeSeriesTransformerForCausalLM,) if is_torch_available() else () test_pruning = False is_encoder_decoder = False From 956f3696e14cc8179c19ccc3f3d6d3e8f3a8afeb Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 8 Jul 2022 11:53:38 +0200 Subject: [PATCH 027/164] remove TimeSeriesTransformerTokenizer Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6d68dfa6efe8d..318a4dea10d69 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -311,7 +311,6 @@ "models.time_series_transformer": [ "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimeSeriesTransformerConfig", - "TimeSeriesTransformerTokenizer", ], "models.trajectory_transformer": [ "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", From 9322bb1756aac41f035275637fcc205712663042 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 8 Jul 2022 11:54:47 +0200 Subject: [PATCH 028/164] Update src/transformers/__init__.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 318a4dea10d69..4e177f46ec60c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -759,10 +759,6 @@ _import_structure["models.time_series_transformer"].extend( [ "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "TimeSeriesTransformerForCausalLM", - "TimeSeriesTransformerForConditionalGeneration", - "TimeSeriesTransformerForQuestionAnswering", - "TimeSeriesTransformerForSequenceClassification", "TimeSeriesTransformerForPrediction", "TimeSeriesTransformerModel", "TimeSeriesTransformerPreTrainedModel", From 2babbe99d81ceb881968f2811b0982453e5e5fee Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 8 Jul 2022 11:55:02 +0200 Subject: [PATCH 029/164] Update src/transformers/models/time_series_transformer/__init__.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/models/time_series_transformer/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/__init__.py b/src/transformers/models/time_series_transformer/__init__.py index 0eb64b484e854..c3bab61846afe 100644 --- a/src/transformers/models/time_series_transformer/__init__.py +++ 
b/src/transformers/models/time_series_transformer/__init__.py @@ -46,10 +46,6 @@ else: _import_structure["modeling_time_series_transformer"] = [ "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "TimeSeriesTransformerForConditionalGeneration", - "TimeSeriesTransformerForQuestionAnswering", - "TimeSeriesTransformerForSequenceClassification", - "TimeSeriesTransformerForCausalLM", "TimeSeriesTransformerForPrediction", "TimeSeriesTransformerModel", "TimeSeriesTransformerPreTrainedModel", From 1dc5e85148d3011b2729a260e555c71704876a88 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 11 Jul 2022 18:31:58 +0100 Subject: [PATCH 030/164] fixed copyright --- .../configuration_time_series_transformer.py | 2 +- .../time_series_transformer/modeling_time_series_transformer.py | 2 +- .../time_series_transformer/time_series_transformations.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index ae66bb023e15e..40d32ece59571 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 kashif and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 5cba4bd827a5e..901d15b9c4fd9 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 kashif The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/transformers/models/time_series_transformer/time_series_transformations.py b/src/transformers/models/time_series_transformer/time_series_transformations.py index b3486485e880c..6f88776e12c0c 100644 --- a/src/transformers/models/time_series_transformer/time_series_transformations.py +++ b/src/transformers/models/time_series_transformer/time_series_transformations.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 kashif and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
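[Editor's note] By this point in the series (patches 020-030), the GluonTS-backed helpers in `time_series_transformations.py` and the `TimeSeriesTransformerForPrediction` head can in principle be exercised end to end, as patch 026 ("training working") suggests. The following is a minimal, illustrative smoke-test sketch of that state, not code taken from the patches: it assumes GluonTS and PyTorch are installed, invents a tiny synthetic hourly dataset, and picks `freq`, `prediction_length`, the explicit `lags_seq`, batch size and learning rate arbitrarily. Later commits in the series may rename or reorganize these APIs.

```python
# Minimal smoke-test sketch (not part of the patch series): wires together the
# config, the GluonTS-based data helpers from patches 020-024, and the
# TimeSeriesTransformerForPrediction head from patches 025-026.
# Assumptions: gluonts + torch installed, and the module layout as of patch 030;
# the dataset, freq, prediction_length, lags_seq, batch size and lr are made up.
import numpy as np
import torch
from gluonts.dataset.common import ListDataset

from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerForPrediction
from transformers.models.time_series_transformer.time_series_transformations import (
    create_training_data_loader,
)

# A single synthetic hourly series; a short explicit lags_seq keeps the
# lag-derived model width small for this toy run.
dataset = ListDataset(
    [{"start": "2020-01-01 00:00:00", "target": np.sin(np.arange(400) / 24.0).tolist()}],
    freq="1H",
)

config = TimeSeriesTransformerConfig(
    freq="1H",
    prediction_length=24,
    lags_seq=[1, 2, 3, 4, 5, 6, 7, 24],  # illustrative; d_model is derived from len(lags_seq) + feature count
)

train_loader = create_training_data_loader(
    config, dataset, batch_size=8, num_batches_per_epoch=10
)

model = TimeSeriesTransformerForPrediction(config)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

model.train()
for batch in train_loader:
    optimizer.zero_grad()
    # As of patch 026, forward() consumes the whole batch dict produced by the
    # data loader and returns the weighted negative log-likelihood.
    loss = model(batch)
    loss.backward()
    optimizer.step()
    print(f"loss: {loss.item():.4f}")
```

Passing the raw batch dictionary straight into `forward()` mirrors the patch 026 state; the patches that follow (e.g. 033-038) already start renaming configuration fields such as `distr_output` to `distribution_output`, so the exact attribute names should be checked against whichever commit is actually installed.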
From 7d6732f73540ca066fd763abee578863621c65e2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 11 Jul 2022 18:35:46 +0100 Subject: [PATCH 031/164] removed docs --- .../en/model_doc/time_series_transformer.mdx | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/docs/source/en/model_doc/time_series_transformer.mdx b/docs/source/en/model_doc/time_series_transformer.mdx index 4c768a8058bf2..cd309be74fdb9 100644 --- a/docs/source/en/model_doc/time_series_transformer.mdx +++ b/docs/source/en/model_doc/time_series_transformer.mdx @@ -31,47 +31,8 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE]( Date: Mon, 11 Jul 2022 18:39:19 +0100 Subject: [PATCH 032/164] remove time series tokenizer --- src/transformers/__init__.py | 1 - .../models/time_series_transformer/__init__.py | 10 +--------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4e177f46ec60c..4c47a292f4ef2 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -509,7 +509,6 @@ ] else: # Fast tokenizers structure - _import_structure["models.time_series_transformer"].append("TimeSeriesTransformerTokenizerFast") _import_structure["models.albert"].append("AlbertTokenizerFast") _import_structure["models.bart"].append("BartTokenizerFast") _import_structure["models.barthez"].append("BarthezTokenizerFast") diff --git a/src/transformers/models/time_series_transformer/__init__.py b/src/transformers/models/time_series_transformer/__init__.py index c3bab61846afe..4e55f7fb8a1f1 100644 --- a/src/transformers/models/time_series_transformer/__init__.py +++ b/src/transformers/models/time_series_transformer/__init__.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from ...utils import _LazyModule, OptionalDependencyNotAvailable, is_tokenizers_available +from ...utils import _LazyModule, OptionalDependencyNotAvailable from ...utils import is_torch_available @@ -30,14 +30,6 @@ "tokenization_time_series_transformer": ["TimeSeriesTransformerTokenizer"], } -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_time_series_transformer_fast"] = ["TimeSeriesTransformerTokenizerFast"] - try: if not is_torch_available(): raise OptionalDependencyNotAvailable() From 800b452190a97c67f9ed59d1ee71a75f445194f8 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 12 Jul 2022 07:32:52 +0100 Subject: [PATCH 033/164] fixed docs --- .../configuration_time_series_transformer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 40d32ece59571..eb30f527c656f 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -32,30 +32,30 @@ class TimeSeriesTransformerConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`~TimeSeriesTransformerModel`]. - It is used to instantiate an TimeSeriesTransformer model according to the specified arguments, defining the model + It is used to instantiate a TimeSeriesTransformer model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the TimeSeriesTransformer [huggingface/tst-ett](https://huggingface.co/huggingface/tst-ett) architecture. - Configuration objects inherit from [`PretrainedConfig`] and can be used - to control the model outputs. Read the documentation from [`PretrainedConfig`] - for more information. - + Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. + Read the documentation from [`PretrainedConfig`] for more information. Args: prediction_length (`int`): The prediction horizon for the model. context_length (`int`, *optional*, default to `None`): - The context length for the encoder. If `None`, the context length will be the same as the prediction length. + The context length for the encoder. If `None`, the context length will be the same as the + `prediction_length`. distr_output (`string` default to `StudentT`): The distribution emission head for the model. loss (`string` default to `NLL`): The loss function for the model with corresponding to the `distr_output` head. input_size (`int` default to 1): The size of the target variable which by default is 1 for univariate targets. - scaling (`bool` default to `True`): + scaling (`bool`, *optional* default to `True`): Whether to scale the input targets. freq (`str`, *optional* default to `None`): - The frequency of the input time series. If `None`, the `lags_seq` and `num_time_features` are set at the finest temporal resolution of 1 Second. + The frequency of the input time series. If `None`, the `lags_seq` and `num_time_features` are set at + the finest temporal resolution of 1 Second. lags_seq (`list` of `int`, *optional* default to `None`): The lags of the input time series. If `None`, the `freq` is used to determine the lags. num_feat_dynamic_real (`int`, *optional* default to `0`): From 16109a7c039155287289b55ebddb54842abd93a2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 12 Jul 2022 12:30:01 +0100 Subject: [PATCH 034/164] fix text --- .../configuration_time_series_transformer.py | 32 +++++++++---------- .../modeling_time_series_transformer.py | 12 +++---- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index eb30f527c656f..4320e9a099ed5 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -42,31 +42,31 @@ class TimeSeriesTransformerConfig(PretrainedConfig): Args: prediction_length (`int`): The prediction horizon for the model. - context_length (`int`, *optional*, default to `None`): + context_length (`int`, *optional*): The context length for the encoder. If `None`, the context length will be the same as the `prediction_length`. - distr_output (`string` default to `StudentT`): + distribution_output (`string`, *optional* defaults to `student_t`): The distribution emission head for the model. - loss (`string` default to `NLL`): - The loss function for the model with corresponding to the `distr_output` head. - input_size (`int` default to 1): + loss (`string`, *optional* defaults to `nll`): + The loss function for the model with corresponding to the `distribution_output` head. 
+ input_size (`int`, *optional* defaults to 1): The size of the target variable which by default is 1 for univariate targets. - scaling (`bool`, *optional* default to `True`): + scaling (`bool`, *optional* defaults to `True`): Whether to scale the input targets. - freq (`str`, *optional* default to `None`): + freq (`str`, *optional*): The frequency of the input time series. If `None`, the `lags_seq` and `num_time_features` are set at the finest temporal resolution of 1 Second. - lags_seq (`list` of `int`, *optional* default to `None`): + lags_seq (`list` of `int`, *optional*): The lags of the input time series. If `None`, the `freq` is used to determine the lags. - num_feat_dynamic_real (`int`, *optional* default to `0`): + num_feat_dynamic_real (`int`, *optional* defaults to `0`): The number of dynamic real valued features. - num_feat_static_cat (`int`, *optional* default to `0`): + num_feat_static_cat (`int`, *optional* defaults to `0`): The number of static categorical features. - num_feat_static_real (`int`, *optional* default to `0`): + num_feat_static_real (`int`, *optional* defaults to `0`): The number of static real valued features. - cardinality (`list` of `int`, *optional* default to `None`): + cardinality (`list` of `int`, *optional*): The cardinality of the categorical features. Cannot be `None` if `num_feat_static_cat` is `> 0`. - embedding_dimension (`list` of `int`, *optional* default to `None`): + embedding_dimension (`list` of `int`, *optional*): The dimension of the embedding for the categorical features. Cannot be `None` if `num_feat_static_cat` is `> 0`. encoder_layers (`int`, *optional*, defaults to 2): Number of encoder layers. @@ -104,8 +104,8 @@ def __init__( freq: Optional[str] = None, prediction_length: Optional[int] = None, context_length: Optional[int] = None, - distr_output: str = "StudentT", - loss: str = "NLL", + distribution_output: str = "StudentT", + loss: str = "nll", lags_seq: Optional[List[int]] = None, scaling: bool = True, num_feat_dynamic_real: int = 0, @@ -127,7 +127,7 @@ def __init__( self.prediction_length = prediction_length self.context_length = context_length or prediction_length self.freq = freq or "1S" - self.distr_output = distr_output + self.distribution_output = distribution_output self.loss = loss self.input_size = input_size self.num_time_features = ( diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 901d15b9c4fd9..f57be04cf56bc 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1305,12 +1305,12 @@ def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) self.config = config self.transformer = TimeSeriesTransformerModel(config) - if config.distr_output == "StudentT": - self.distr_output = StudentTOutput() - self.param_proj = self.distr_output.get_args_proj(self.transformer.d_model) - self.target_shape = self.distr_output.event_shape + if config.distribution_output == "student_t": + self.distribution_output = StudentTOutput() + self.param_proj = self.distribution_output.get_args_proj(self.transformer.d_model) + self.target_shape = self.distribution_output.event_shape - if config.loss == "NLL": + if config.loss == "nll": self.loss = NegativeLogLikelihood() def output_params(self, dec_output): @@ -1321,7 +1321,7 @@ def 
output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist sliced_params = params if trailing_n is not None: sliced_params = [p[:, -trailing_n:] for p in params] - return self.distr_output.distribution(sliced_params, scale=scale) + return self.distribution_output.distribution(sliced_params, scale=scale) def forward(self, batch): feat_static_cat = batch["feat_static_cat"] From bcff605e7652582707dc8fa94e952377b69a86d5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 12 Jul 2022 12:32:42 +0100 Subject: [PATCH 035/164] fix second --- .../configuration_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 4320e9a099ed5..d21efe66bd62a 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -55,7 +55,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): Whether to scale the input targets. freq (`str`, *optional*): The frequency of the input time series. If `None`, the `lags_seq` and `num_time_features` are set at - the finest temporal resolution of 1 Second. + the finest temporal resolution of 1 second. lags_seq (`list` of `int`, *optional*): The lags of the input time series. If `None`, the `freq` is used to determine the lags. num_feat_dynamic_real (`int`, *optional* defaults to `0`): From cce6973765c18c218736435c2ab96bdbb4e1e0b1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 12 Jul 2022 12:34:46 +0100 Subject: [PATCH 036/164] fix default --- .../configuration_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index d21efe66bd62a..5528cfbab916c 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -104,7 +104,7 @@ def __init__( freq: Optional[str] = None, prediction_length: Optional[int] = None, context_length: Optional[int] = None, - distribution_output: str = "StudentT", + distribution_output: str = "student_t", loss: str = "nll", lags_seq: Optional[List[int]] = None, scaling: bool = True, From 216c2062484dd56ae294c3e03699a1c7cd660c46 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 13 Jul 2022 07:25:47 +0100 Subject: [PATCH 037/164] fix order --- .../configuration_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 5528cfbab916c..2ede56ba372ed 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -85,7 +85,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): Example: ```python - >>> from transformers import TimeSeriesTransformerModel, TimeSeriesTransformerConfig + >>> from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerModel >>> # 
Initializing a TimeSeriesTransformer huggingface/tst-ett style configuration >>> configuration = TimeSeriesTransformerConfig() From 8dae3cfaf36af210c1a21161cd3c6a625522f3b5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 13 Jul 2022 07:29:52 +0100 Subject: [PATCH 038/164] use config directly --- .../modeling_time_series_transformer.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index f57be04cf56bc..d5d59d2a1161f 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -56,7 +56,6 @@ _CHECKPOINT_FOR_DOC = "huggingface/tst-ett" _CONFIG_FOR_DOC = "TimeSeriesTransformerConfig" -_TOKENIZER_FOR_DOC = "TimeSeriesTransformerTokenizer" TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -1096,34 +1095,34 @@ def _number_of_features(self) -> int: def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) - if self.config.scaling: + if config.scaling: self.scaler = MeanScaler(dim=1, keepdim=True) else: self.scaler = NOPScaler(dim=1, keepdim=True) self.embedder = FeatureEmbedder( - cardinalities=self.config.cardinality, - embedding_dims=self.config.embedding_dimension, + cardinalities=config.cardinality, + embedding_dims=config.embedding_dimension, ) - self.d_model = self.config.input_size * len(self.config.lags_seq) + self._number_of_features + self.d_model = config.input_size * len(config.lags_seq) + self._number_of_features # transformer enc-decoder and mask initializer self.transformer = nn.Transformer( d_model=self.d_model, - nhead=self.config.nhead, - num_encoder_layers=self.config.encoder_layers, - num_decoder_layers=self.config.decoder_layers, - dim_feedforward=self.config.ffn_dim, - dropout=self.config.dropout, - activation=self.config.activation_function, + nhead=config.nhead, + num_encoder_layers=config.encoder_layers, + num_decoder_layers=config.decoder_layers, + dim_feedforward=config.ffn_dim, + dropout=config.dropout, + activation=config.activation_function, batch_first=True, ) # causal decoder tgt mask self.register_buffer( "tgt_mask", - self.transformer.generate_square_subsequent_mask(self.config.prediction_length), + self.transformer.generate_square_subsequent_mask(config.prediction_length), ) # Initialize weights and apply final processing From 9afbac2c6b7b49f2d1df850b008bb4c1c4443c7c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 13 Jul 2022 07:54:39 +0100 Subject: [PATCH 039/164] undo change --- src/transformers/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4c47a292f4ef2..4fab68db82a26 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3029,7 +3029,6 @@ from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimeSeriesTransformerConfig, - TimeSeriesTransformerTokenizer, ) from .models.trajectory_transformer import ( TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -3253,7 +3252,6 @@ from .models.splinter import SplinterTokenizerFast from .models.squeezebert import SqueezeBertTokenizerFast from .models.t5 import T5TokenizerFast - from .models.time_series_transformer import TimeSeriesTransformerTokenizerFast from 
.models.xglm import XGLMTokenizerFast from .models.xlm_roberta import XLMRobertaTokenizerFast from .models.xlnet import XLNetTokenizerFast @@ -3404,6 +3402,8 @@ ) from .generation_utils import top_k_top_p_filtering from .modeling_utils import PreTrainedModel + + # PyTorch model imports from .models.albert import ( ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertForMaskedLM, From 2c06f970c0d9ff0857e4cce319b23510d8e33bbc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 13 Jul 2022 07:56:43 +0100 Subject: [PATCH 040/164] fix comment --- src/transformers/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4fab68db82a26..3f87ab68efe51 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4215,8 +4215,6 @@ T5PreTrainedModel, load_tf_weights_in_t5, ) - - # PyTorch model imports from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, TimeSeriesTransformerForCausalLM, From 050b8b5ab4f381460e4f1a9401d7c164377bf58c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 13 Jul 2022 07:59:04 +0100 Subject: [PATCH 041/164] fix year --- src/transformers/models/time_series_transformer/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/__init__.py b/src/transformers/models/time_series_transformer/__init__.py index 4e55f7fb8a1f1..0fbb3e85314a8 100644 --- a/src/transformers/models/time_series_transformer/__init__.py +++ b/src/transformers/models/time_series_transformer/__init__.py @@ -2,7 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -27,7 +27,6 @@ "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimeSeriesTransformerConfig", ], - "tokenization_time_series_transformer": ["TimeSeriesTransformerTokenizer"], } try: From d0861859715781755d9952a03188ad67a8a7e518 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 15 Jul 2022 10:34:41 +0200 Subject: [PATCH 042/164] fix import --- .../modeling_time_series_transformer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index d5d59d2a1161f..ab609381e2488 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1259,7 +1259,6 @@ def get_decoder(self): @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC, @@ -1557,7 +1556,6 @@ def __init__(self, config: TimeSeriesTransformerConfig, **kwargs): @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, @@ -1674,7 +1672,6 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=Seq2SeqQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, From dc2acd4b5ff2bd09886c9d58742c838fc177cbb9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 15 Jul 2022 10:46:07 +0200 Subject: [PATCH 043/164] add additional arguments for training vs. 
test --- .../modeling_time_series_transformer.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index ab609381e2488..41c342696d2f9 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1321,15 +1321,17 @@ def output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist sliced_params = [p[:, -trailing_n:] for p in params] return self.distribution_output.distribution(sliced_params, scale=scale) - def forward(self, batch): - feat_static_cat = batch["feat_static_cat"] - feat_static_real = batch["feat_static_real"] - past_time_feat = batch["past_time_feat"] - past_target = batch["past_target"] - future_time_feat = batch["future_time_feat"] - future_target = batch["future_target"] - past_observed_values = batch["past_observed_values"] - future_observed_values = batch["future_observed_values"] + def forward( + self, + feat_static_cat: torch.Tensor, + feat_static_real: torch.Tensor, + past_time_feat: torch.Tensor, + past_target: torch.Tensor, + past_observed_values: torch.Tensor, + future_time_feat: Optional[torch.Tensor] = None, + future_target: Optional[torch.Tensor] = None, + future_observed_values: Optional[torch.Tensor] = None, + ): dec_output, scale = self.transformer( feat_static_cat, From 758b90de84c1df7d730714f1395053d53eb1cc41 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 15 Jul 2022 11:04:23 +0200 Subject: [PATCH 044/164] initial greedy inference loop --- .../configuration_time_series_transformer.py | 4 + .../modeling_time_series_transformer.py | 87 +++++++++++++++---- 2 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 2ede56ba372ed..1845fd000914e 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -81,6 +81,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig): `"gelu"` and `"relu"` are supported. dropout (`float`, *optional*, defaults to 0.1): The dropout probability for all fully connected layers in the encoder, and decoder. + num_parallel_samples (`int`, *optional*, defaults to 100): + The number of samples to generate in parallel for each time step of inference. 
Example: @@ -120,6 +122,7 @@ def __init__( is_encoder_decoder: bool = True, activation_function: str = "gelu", dropout: float = 0.1, + num_parallel_samples: int = 100, init_std: float = 0.02, **kwargs ): @@ -140,6 +143,7 @@ def __init__( self.num_feat_static_cat = num_feat_static_cat self.cardinality = cardinality if cardinality and num_feat_static_cat > 0 else [1] self.embedding_dimension = embedding_dimension or [min(50, (cat + 1) // 2) for cat in self.cardinality] + self.num_parallel_samples = num_parallel_samples # Transformer architecture parameters self.nhead = nhead diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 41c342696d2f9..de98d63501878 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1333,26 +1333,81 @@ def forward( future_observed_values: Optional[torch.Tensor] = None, ): - dec_output, scale = self.transformer( - feat_static_cat, - feat_static_real, - past_time_feat, - past_target, - past_observed_values, - future_time_feat, - future_target, - ) - params = self.output_params(dec_output) - distr = self.output_distribution(params, scale) + loss = None + + if future_target is not None and future_observed_values is not None: + # training + dec_output, scale = self.transformer( + feat_static_cat, + feat_static_real, + past_time_feat, + past_target, + past_observed_values, + future_time_feat, + future_target, + ) + params = self.output_params(dec_output) + distr = self.output_distribution(params, scale) - loss_values = self.loss(distr, future_target) + loss = self.loss(distr, future_target) - if len(self.target_shape) == 0: - loss_weights = future_observed_values + if len(self.target_shape) == 0: + loss_weights = future_observed_values + else: + loss_weights = future_observed_values.min(dim=-1, keepdim=False) + + return weighted_average(loss, weights=loss_weights) else: - loss_weights = future_observed_values.min(dim=-1, keepdim=False) + # prediction + num_parallel_samples = self.config.num_parallel_samples + + encoder_inputs, scale, static_feat = self.create_network_inputs( + feat_static_cat, + feat_static_real, + past_time_feat, + past_target, + past_observed_values, + ) + enc_out = self.transformer.encoder(encoder_inputs) - return weighted_average(loss_values, weights=loss_weights) + repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0) + + repeated_past_target = past_target.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale + + expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_feat.shape[1], -1) + features = torch.cat((expanded_static_feat, future_time_feat), dim=-1) + repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0) + + repeated_enc_out = enc_out.repeat_interleave(repeats=num_parallel_samples, dim=0) + + future_samples = [] + + # greedy decoding + for k in range(self.prediction_length): + lagged_sequence = self.get_lagged_subsequences( + sequence=repeated_past_target, + subsequences_length=1 + k, + shift=1, + ) + + lags_shape = lagged_sequence.shape + reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1) + + decoder_input = torch.cat((reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1) + + output = self.transformer.decoder(decoder_input, repeated_enc_out) 
+ + params = self.param_proj(output[:, -1:]) + distr = self.output_distribution(params, scale=repeated_scale) + next_sample = distr.sample() + + repeated_past_target = torch.cat((repeated_past_target, next_sample / repeated_scale), dim=1) + future_samples.append(next_sample) + + concat_future_samples = torch.cat(future_samples, dim=1) + return concat_future_samples.reshape( + (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape, + ) @add_start_docstrings( From c9f854092de21d814ce341d76a270d2d9dff56b9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 15 Jul 2022 11:43:51 +0200 Subject: [PATCH 045/164] fix inference --- .../modeling_time_series_transformer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index de98d63501878..172cdcb2f265a 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1359,6 +1359,9 @@ def forward( return weighted_average(loss, weights=loss_weights) else: # prediction + encoder = self.transformer.get_encoder() + decoder = self.transformer.get_decoder() + num_parallel_samples = self.config.num_parallel_samples encoder_inputs, scale, static_feat = self.create_network_inputs( @@ -1368,7 +1371,8 @@ def forward( past_target, past_observed_values, ) - enc_out = self.transformer.encoder(encoder_inputs) + + enc_out = encoder(encoder_inputs) repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0) @@ -1383,7 +1387,7 @@ def forward( future_samples = [] # greedy decoding - for k in range(self.prediction_length): + for k in range(self.config.prediction_length): lagged_sequence = self.get_lagged_subsequences( sequence=repeated_past_target, subsequences_length=1 + k, @@ -1395,7 +1399,7 @@ def forward( decoder_input = torch.cat((reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1) - output = self.transformer.decoder(decoder_input, repeated_enc_out) + output = decoder(decoder_input, repeated_enc_out) params = self.param_proj(output[:, -1:]) distr = self.output_distribution(params, scale=repeated_scale) From 97d67437d0dac7c115ba9d29c28397e5102c3500 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 21 Jul 2022 10:34:30 +0200 Subject: [PATCH 046/164] comment out token inputs to enc dec --- .../modeling_time_series_transformer.py | 124 +++++++++--------- 1 file changed, 65 insertions(+), 59 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 172cdcb2f265a..97e17ce23f590 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -657,19 +657,21 @@ def __init__(self, config: TimeSeriesTransformerConfig, embed_tokens: Optional[n self.layerdrop = config.encoder_layerdrop embed_dim = config.d_model - self.padding_idx = config.pad_token_id - self.max_source_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + # self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - if embed_tokens is not None: - self.embed_tokens = embed_tokens - 
else: - self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + # self.padding_idx = config.pad_token_id + # self.max_source_positions = config.max_position_embeddings + + # if embed_tokens is not None: + # self.embed_tokens = embed_tokens + # else: + # self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + # self.embed_positions = TimeSeriesTransformerLearnedPositionalEmbedding( + # config.max_position_embeddings, + # embed_dim, + # ) - self.embed_positions = TimeSeriesTransformerLearnedPositionalEmbedding( - config.max_position_embeddings, - embed_dim, - ) self.layers = nn.ModuleList([TimeSeriesTransformerEncoderLayer(config) for _ in range(config.encoder_layers)]) self.layernorm_embedding = nn.LayerNorm(embed_dim) @@ -679,10 +681,10 @@ def __init__(self, config: TimeSeriesTransformerConfig, embed_tokens: Optional[n def forward( self, - input_ids=None, + inputs_embeds: torch.Tensor, + # input_ids=None, attention_mask=None, head_mask=None, - inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -730,23 +732,24 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") + # # retrieve input_ids and inputs_embeds + # if input_ids is not None and inputs_embeds is not None: + # raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + # elif input_ids is not None: + # input_shape = input_ids.size() + # input_ids = input_ids.view(-1, input_shape[-1]) + # elif inputs_embeds is not None: + # input_shape = inputs_embeds.size()[:-1] + # else: + # raise ValueError("You have to specify either input_ids or inputs_embeds") - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + # if inputs_embeds is None: + # inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - embed_pos = self.embed_positions(input_shape) + # embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds # + embed_pos - hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -822,19 +825,21 @@ def __init__(self, config: TimeSeriesTransformerConfig, embed_tokens: Optional[n super().__init__(config) self.dropout = config.dropout self.layerdrop = config.decoder_layerdrop - self.padding_idx = config.pad_token_id - self.max_target_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 - if embed_tokens is not None: - self.embed_tokens = embed_tokens - else: - self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + # self.padding_idx = config.pad_token_id + # self.max_target_positions = config.max_position_embeddings + # self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + # if embed_tokens is not None: + # self.embed_tokens = embed_tokens + # else: + # self.embed_tokens = 
nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + # self.embed_positions = TimeSeriesTransformerLearnedPositionalEmbedding( + # config.max_position_embeddings, + # config.d_model, + # ) - self.embed_positions = TimeSeriesTransformerLearnedPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - ) self.layers = nn.ModuleList([TimeSeriesTransformerDecoderLayer(config) for _ in range(config.decoder_layers)]) self.layernorm_embedding = nn.LayerNorm(config.d_model) @@ -842,11 +847,11 @@ def __init__(self, config: TimeSeriesTransformerConfig, embed_tokens: Optional[n # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens + # def get_input_embeddings(self): + # return self.embed_tokens - def set_input_embeddings(self, value): - self.embed_tokens = value + # def set_input_embeddings(self, value): + # self.embed_tokens = value def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask @@ -868,14 +873,14 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em def forward( self, - input_ids=None, + inputs_embeds: torch.Tensor, + # input_ids=None, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, head_mask=None, cross_attn_head_mask=None, past_key_values=None, - inputs_embeds=None, use_cache=None, output_attentions=None, output_hidden_states=None, @@ -952,22 +957,23 @@ def forward( use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + # # retrieve input_ids and inputs_embeds + # if input_ids is not None and inputs_embeds is not None: + # raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + # elif input_ids is not None: + # input_shape = input_ids.size() + # input_ids = input_ids.view(-1, input_shape[-1]) + # elif inputs_embeds is not None: + # input_shape = inputs_embeds.size()[:-1] + # else: + # raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + input_shape = inputs_embeds.size()[:-1] # past_key_values_length past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + # if inputs_embeds is None: + # inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale attention_mask = self._prepare_decoder_attention_mask( attention_mask, input_shape, inputs_embeds, past_key_values_length @@ -978,10 +984,10 @@ def forward( # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) - # embed positions - positions = self.embed_positions(input_shape, past_key_values_length) + # # embed positions + # positions = self.embed_positions(input_shape, 
past_key_values_length) - hidden_states = inputs_embeds + positions + hidden_states = inputs_embeds # + positions hidden_states = self.layernorm_embedding(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) From b512c50ffde18e6d58c2ad2ecb6eb3836652caf5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 7 Aug 2022 12:10:33 -0400 Subject: [PATCH 047/164] Use HF encoder/decoder --- .../configuration_time_series_transformer.py | 17 +++++ .../modeling_time_series_transformer.py | 73 ++++++++++--------- .../test_modeling_time_series_transformer.py | 1 - 3 files changed, 55 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 1845fd000914e..0535656334678 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -124,6 +124,7 @@ def __init__( dropout: float = 0.1, num_parallel_samples: int = 100, init_std: float = 0.02, + use_cache=True, **kwargs ): # time series specific parameters @@ -147,11 +148,27 @@ def __init__( # Transformer architecture parameters self.nhead = nhead + self.encoder_attention_heads = nhead + self.decoder_attention_heads = nhead + self.encoder_layers = encoder_layers self.decoder_layers = decoder_layers self.ffn_dim = ffn_dim + self.encoder_ffn_dim = ffn_dim + self.decoder_ffn_dim = ffn_dim + self.dropout = dropout + self.attention_dropout = dropout + self.activation_dropout = dropout + self.encoder_layerdrop = dropout + self.decoder_layerdrop = dropout + self.activation_function = activation_function self.init_std = init_std + self.output_attentions = False + self.output_hidden_states = False + + self.use_cache = use_cache + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 97e17ce23f590..dd8795c26019f 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -108,21 +108,21 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -class TimeSeriesTransformerLearnedPositionalEmbedding(nn.Embedding): - """ - This module learns positional embeddings up to a fixed maximum size. - """ +# class TimeSeriesTransformerLearnedPositionalEmbedding(nn.Embedding): +# """ +# This module learns positional embeddings up to a fixed maximum size. 
+# """ - def __init__(self, num_embeddings: int, embedding_dim: int): - super().__init__(num_embeddings, embedding_dim) +# def __init__(self, num_embeddings: int, embedding_dim: int): +# super().__init__(num_embeddings, embedding_dim) - def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): - """`input_ids_shape` is expected to be [bsz x seqlen].""" - bsz, seq_len = input_ids_shape[:2] - positions = torch.arange( - past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device - ) - return super().forward(positions) +# def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): +# """`input_ids_shape` is expected to be [bsz x seqlen].""" +# bsz, seq_len = input_ids_shape[:2] +# positions = torch.arange( +# past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device +# ) +# return super().forward(positions) class TimeSeriesTransformerAttention(nn.Module): @@ -1111,25 +1111,28 @@ def __init__(self, config: TimeSeriesTransformerConfig): embedding_dims=config.embedding_dimension, ) - self.d_model = config.input_size * len(config.lags_seq) + self._number_of_features + config.d_model = config.input_size * len(config.lags_seq) + self._number_of_features # transformer enc-decoder and mask initializer - self.transformer = nn.Transformer( - d_model=self.d_model, - nhead=config.nhead, - num_encoder_layers=config.encoder_layers, - num_decoder_layers=config.decoder_layers, - dim_feedforward=config.ffn_dim, - dropout=config.dropout, - activation=config.activation_function, - batch_first=True, - ) + self.encoder = TimeSeriesTransformerEncoder(config) + self.decoder = TimeSeriesTransformerDecoder(config) - # causal decoder tgt mask - self.register_buffer( - "tgt_mask", - self.transformer.generate_square_subsequent_mask(config.prediction_length), - ) + # self.transformer = nn.Transformer( + # d_model=self.d_model, + # nhead=config.nhead, + # num_encoder_layers=config.encoder_layers, + # num_decoder_layers=config.decoder_layers, + # dim_feedforward=config.ffn_dim, + # dropout=config.dropout, + # activation=config.activation_function, + # batch_first=True, + # ) + + # # causal decoder tgt mask + # self.register_buffer( + # "tgt_mask", + # self.transformer.generate_square_subsequent_mask(config.prediction_length), + # ) # Initialize weights and apply final processing self.post_init() @@ -1246,8 +1249,8 @@ def output_params(self, transformer_inputs): enc_input = transformer_inputs[:, : self.config.context_length, ...] dec_input = transformer_inputs[:, self.config.context_length :, ...] 
- enc_out = self.transformer.encoder(enc_input) - return self.transformer.decoder(dec_input, enc_out, tgt_mask=self.tgt_mask) + encoder_outputs = self.encoder(inputs_embeds=enc_input) + return self.decoder(inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state) def get_input_embeddings(self): return self.shared @@ -1258,10 +1261,10 @@ def set_input_embeddings(self, value): self.decoder.embed_tokens = self.shared def get_encoder(self): - return self.transformer.encoder + return self.encoder def get_decoder(self): - return self.transformer.decoder + return self.decoder @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( @@ -1290,7 +1293,7 @@ def forward( ) dec_output = self.output_params(transformer_inputs) - return dec_output, scale + return dec_output.last_hidden_state, scale # return Seq2SeqModelOutput( # last_hidden_state=decoder_outputs.last_hidden_state, @@ -1311,7 +1314,7 @@ def __init__(self, config: TimeSeriesTransformerConfig): self.transformer = TimeSeriesTransformerModel(config) if config.distribution_output == "student_t": self.distribution_output = StudentTOutput() - self.param_proj = self.distribution_output.get_args_proj(self.transformer.d_model) + self.param_proj = self.distribution_output.get_args_proj(self.transformer.config.d_model) self.target_shape = self.distribution_output.event_shape if config.loss == "nll": diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 410ee4101f526..0faaa53b43ef7 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -39,7 +39,6 @@ TimeSeriesTransformerForSequenceClassification, TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel, - TimeSeriesTransformerTokenizer, ) from transformers.models.time_series_transformer.modeling_time_series_transformer import ( TimeSeriesTransformerDecoder, From 7f332f2ddbd8a1a09c8b6ee89f3c190e52bb8277 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 7 Aug 2022 12:17:34 -0400 Subject: [PATCH 048/164] fix inference --- .../modeling_time_series_transformer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index dd8795c26019f..c3dcb28968f61 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1381,7 +1381,8 @@ def forward( past_observed_values, ) - enc_out = encoder(encoder_inputs) + encoder_outputs = encoder(inputs_embeds=encoder_inputs) + enc_last_hidden = encoder_outputs.last_hidden_state repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0) @@ -1391,7 +1392,7 @@ def forward( features = torch.cat((expanded_static_feat, future_time_feat), dim=-1) repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0) - repeated_enc_out = enc_out.repeat_interleave(repeats=num_parallel_samples, dim=0) + repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0) future_samples = [] @@ -1408,9 +1409,10 @@ def forward( decoder_input = torch.cat((reshaped_lagged_sequence, 
repeated_features[:, : k + 1]), dim=-1) - output = decoder(decoder_input, repeated_enc_out) + dec_output = decoder(inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden) + dec_last_hidden = dec_output.last_hidden_state - params = self.param_proj(output[:, -1:]) + params = self.param_proj(dec_last_hidden[:, -1:]) distr = self.output_distribution(params, scale=repeated_scale) next_sample = distr.sample() From 3bce7cc74066bcbf08ba679e67ba1ce68710e34b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 9 Aug 2022 16:56:02 -0400 Subject: [PATCH 049/164] Use Seq2SeqTSModelOutput output --- .../modeling_time_series_transformer.py | 148 +++++++++++++----- 1 file changed, 106 insertions(+), 42 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index c3dcb28968f61..1128d63e30303 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -14,8 +14,8 @@ # limitations under the License. """ PyTorch TimeSeriesTransformer model. """ -import math import copy +from dataclasses import dataclass import random from typing import Optional, Tuple, List @@ -108,6 +108,69 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) +@dataclass +class Seq2SeqTSModelOutput(Seq2SeqModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + scale + """ + + last_hidden_state: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + scale: Optional[float] = None + + # class TimeSeriesTransformerLearnedPositionalEmbedding(nn.Embedding): # """ # This module learns positional embeddings up to a fixed maximum size. @@ -1117,23 +1180,6 @@ def __init__(self, config: TimeSeriesTransformerConfig): self.encoder = TimeSeriesTransformerEncoder(config) self.decoder = TimeSeriesTransformerDecoder(config) - # self.transformer = nn.Transformer( - # d_model=self.d_model, - # nhead=config.nhead, - # num_encoder_layers=config.encoder_layers, - # num_decoder_layers=config.decoder_layers, - # dim_feedforward=config.ffn_dim, - # dropout=config.dropout, - # activation=config.activation_function, - # batch_first=True, - # ) - - # # causal decoder tgt mask - # self.register_buffer( - # "tgt_mask", - # self.transformer.generate_square_subsequent_mask(config.prediction_length), - # ) - # Initialize weights and apply final processing self.post_init() @@ -1245,12 +1291,15 @@ def create_network_inputs( return transformer_inputs, scale, static_feat - def output_params(self, transformer_inputs): + def enc_dec_outputs(self, transformer_inputs): enc_input = transformer_inputs[:, : self.config.context_length, ...] dec_input = transformer_inputs[:, self.config.context_length :, ...] 
encoder_outputs = self.encoder(inputs_embeds=enc_input) - return self.decoder(inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state) + decoder_outputs = self.decoder( + inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state + ) + return encoder_outputs, decoder_outputs def get_input_embeddings(self): return self.shared @@ -1269,7 +1318,7 @@ def get_decoder(self): @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, - output_type=Seq2SeqModelOutput, + output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( @@ -1281,6 +1330,8 @@ def forward( past_observed_values: torch.Tensor, future_time_feat: Optional[torch.Tensor] = None, future_target: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + return_dict: Optional[bool] = None, ): transformer_inputs, scale, _ = self.create_network_inputs( feat_static_cat, @@ -1291,30 +1342,43 @@ def forward( future_time_feat, future_target, ) - dec_output = self.output_params(transformer_inputs) - - return dec_output.last_hidden_state, scale - - # return Seq2SeqModelOutput( - # last_hidden_state=decoder_outputs.last_hidden_state, - # past_key_values=decoder_outputs.past_key_values, - # decoder_hidden_states=decoder_outputs.hidden_states, - # decoder_attentions=decoder_outputs.attentions, - # cross_attentions=decoder_outputs.cross_attentions, - # encoder_last_hidden_state=encoder_outputs.last_hidden_state, - # encoder_hidden_states=encoder_outputs.hidden_states, - # encoder_attentions=encoder_outputs.attentions, - # ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if encoder_outputs is None: + enc_input = transformer_inputs[:, : self.config.context_length, ...] + encoder_outputs = self.encoder(inputs_embeds=enc_input) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs, + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + dec_input = transformer_inputs[:, self.config.context_length :, ...] 
+ decoder_outputs = self.decoder( + inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state + ) + + return Seq2SeqTSModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + scale=scale, + ) class TimeSeriesTransformerForPrediction(TimeSeriesTransformerModel): def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) self.config = config - self.transformer = TimeSeriesTransformerModel(config) + self.model = TimeSeriesTransformerModel(config) if config.distribution_output == "student_t": self.distribution_output = StudentTOutput() - self.param_proj = self.distribution_output.get_args_proj(self.transformer.config.d_model) + self.param_proj = self.distribution_output.get_args_proj(self.model.config.d_model) self.target_shape = self.distribution_output.event_shape if config.loss == "nll": @@ -1346,7 +1410,7 @@ def forward( if future_target is not None and future_observed_values is not None: # training - dec_output, scale = self.transformer( + outputs = self.model( feat_static_cat, feat_static_real, past_time_feat, @@ -1355,8 +1419,8 @@ def forward( future_time_feat, future_target, ) - params = self.output_params(dec_output) - distr = self.output_distribution(params, scale) + params = self.output_params(outputs.last_hidden_state) + distr = self.output_distribution(params, outputs.scale) loss = self.loss(distr, future_target) @@ -1368,8 +1432,8 @@ def forward( return weighted_average(loss, weights=loss_weights) else: # prediction - encoder = self.transformer.get_encoder() - decoder = self.transformer.get_decoder() + encoder = self.model.get_encoder() + decoder = self.model.get_decoder() num_parallel_samples = self.config.num_parallel_samples From 3577729743ac89215cdb805f72ba16fdc8158e62 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 9 Aug 2022 20:44:31 -0400 Subject: [PATCH 050/164] return Seq2SeqTSPredictionOutput --- .../modeling_time_series_transformer.py | 49 +++++++++++++++++-- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 1128d63e30303..49a97035c404e 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -41,6 +41,7 @@ from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, + ModelOutput, Seq2SeqLMOutput, Seq2SeqModelOutput, Seq2SeqQuestionAnsweringModelOutput, @@ -109,7 +110,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] @dataclass -class Seq2SeqTSModelOutput(Seq2SeqModelOutput): +class Seq2SeqTSModelOutput(ModelOutput): """ Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential decoding. 
@@ -171,6 +172,19 @@ class Seq2SeqTSModelOutput(Seq2SeqModelOutput): scale: Optional[float] = None +@dataclass +class Seq2SeqTSPredictionOutput(ModelOutput): + loss: Optional[torch.FloatTensor] = None + params: Optional[Tuple[torch.FloatTensor]] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + # class TimeSeriesTransformerLearnedPositionalEmbedding(nn.Embedding): # """ # This module learns positional embeddings up to a fixed maximum size. @@ -1387,6 +1401,12 @@ def __init__(self, config: TimeSeriesTransformerConfig): def output_params(self, dec_output): return self.param_proj(dec_output) + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + @torch.jit.ignore def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution: sliced_params = params @@ -1404,10 +1424,12 @@ def forward( future_time_feat: Optional[torch.Tensor] = None, future_target: Optional[torch.Tensor] = None, future_observed_values: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ): - - loss = None - + prediction_loss = None if future_target is not None and future_observed_values is not None: # training outputs = self.model( @@ -1418,6 +1440,10 @@ def forward( past_observed_values, future_time_feat, future_target, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) params = self.output_params(outputs.last_hidden_state) distr = self.output_distribution(params, outputs.scale) @@ -1429,7 +1455,20 @@ def forward( else: loss_weights = future_observed_values.min(dim=-1, keepdim=False) - return weighted_average(loss, weights=loss_weights) + prediction_loss = weighted_average(loss, weights=loss_weights) + + return Seq2SeqTSPredictionOutput( + loss=prediction_loss, + params=params, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + else: # prediction encoder = self.model.get_encoder() From 8f7d803178dbff9afeed73b5aece22fb86a8044a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 9 Aug 2022 21:05:21 -0400 Subject: [PATCH 051/164] added default arguments --- .../modeling_time_series_transformer.py | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 49a97035c404e..51671d1b5fb3d 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1345,6 
+1345,9 @@ def forward( future_time_feat: Optional[torch.Tensor] = None, future_target: Optional[torch.Tensor] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, + output_hidden_states: bool = False, + use_cache: bool = False, + output_attentions: bool = False, return_dict: Optional[bool] = None, ): transformer_inputs, scale, _ = self.create_network_inputs( @@ -1356,10 +1359,23 @@ def forward( future_time_feat, future_target, ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if encoder_outputs is None: enc_input = transformer_inputs[:, : self.config.context_length, ...] - encoder_outputs = self.encoder(inputs_embeds=enc_input) + encoder_outputs = self.encoder( + inputs_embeds=enc_input, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs, @@ -1369,9 +1385,17 @@ def forward( dec_input = transformer_inputs[:, self.config.context_length :, ...] decoder_outputs = self.decoder( - inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state + inputs_embeds=dec_input, + encoder_hidden_states=encoder_outputs.last_hidden_state, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) + if not return_dict: + return decoder_outputs + encoder_outputs + return Seq2SeqTSModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, past_key_values=decoder_outputs.past_key_values, @@ -1440,10 +1464,10 @@ def forward( past_observed_values, future_time_feat, future_target, - use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + use_cache=use_cache, ) params = self.output_params(outputs.last_hidden_state) distr = self.output_distribution(params, outputs.scale) From c90c1268cb1a9b7a266b28ba654069a4415a2892 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 10 Aug 2022 14:52:48 -0400 Subject: [PATCH 052/164] fix return_dict true --- .../modeling_time_series_transformer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 51671d1b5fb3d..c7d55e0528a68 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1394,7 +1394,7 @@ def forward( ) if not return_dict: - return decoder_outputs + encoder_outputs + return decoder_outputs + encoder_outputs + (scale,) return Seq2SeqTSModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, @@ -1481,6 +1481,10 @@ def forward( prediction_loss = weighted_average(loss, weights=loss_weights) + if not return_dict: + outputs = (params) + outputs[1:] + return ((prediction_loss,) + outputs) if 
prediction_loss is not None else output + return Seq2SeqTSPredictionOutput( loss=prediction_loss, params=params, From 63d709a493ff7164e708528b7eaceba5bcc12162 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 10 Aug 2022 14:56:42 -0400 Subject: [PATCH 053/164] scale is a tensor --- .../modeling_time_series_transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index c7d55e0528a68..0b529bb7ed5d0 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -169,7 +169,7 @@ class Seq2SeqTSModelOutput(ModelOutput): encoder_last_hidden_state: Optional[torch.FloatTensor] = None encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - scale: Optional[float] = None + scale: Optional[torch.FloatTensor] = None @dataclass @@ -183,6 +183,7 @@ class Seq2SeqTSPredictionOutput(ModelOutput): encoder_last_hidden_state: Optional[torch.FloatTensor] = None encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + scale: Optional[torch.FloatTensor] = None # class TimeSeriesTransformerLearnedPositionalEmbedding(nn.Embedding): @@ -1495,6 +1496,7 @@ def forward( encoder_last_hidden_state=outputs.encoder_last_hidden_state, encoder_hidden_states=outputs.encoder_hidden_states, encoder_attentions=outputs.encoder_attentions, + scale=outputs.scale, ) else: From bcd566f8e954fd05a08828c9435e5cbe714bd27f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 12 Aug 2022 16:04:54 -0400 Subject: [PATCH 054/164] output static_features for inference --- .../modeling_time_series_transformer.py | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 0b529bb7ed5d0..4eb2c2e7eea23 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -170,6 +170,7 @@ class Seq2SeqTSModelOutput(ModelOutput): encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None scale: Optional[torch.FloatTensor] = None + static_features: Optional[torch.FloatTensor] = None @dataclass @@ -184,6 +185,7 @@ class Seq2SeqTSPredictionOutput(ModelOutput): encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None scale: Optional[torch.FloatTensor] = None + static_features: Optional[torch.FloatTensor] = None # class TimeSeriesTransformerLearnedPositionalEmbedding(nn.Embedding): @@ -1351,14 +1353,14 @@ def forward( output_attentions: bool = False, return_dict: Optional[bool] = None, ): - transformer_inputs, scale, _ = self.create_network_inputs( - feat_static_cat, - feat_static_real, - past_time_feat, - past_target, - past_observed_values, - future_time_feat, - future_target, + transformer_inputs, scale, static_feat = self.create_network_inputs( + feat_static_real=feat_static_cat, + feat_static_real=feat_static_real, + 
past_time_feat=past_time_feat, + past_target=past_target, + past_observed_values=past_observed_values, + future_time_feat=future_time_feat, + future_target=future_target, ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1395,7 +1397,7 @@ def forward( ) if not return_dict: - return decoder_outputs + encoder_outputs + (scale,) + return decoder_outputs + encoder_outputs + (scale, static_feat) return Seq2SeqTSModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, @@ -1407,6 +1409,7 @@ def forward( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, scale=scale, + static_features=static_feat, ) @@ -1484,7 +1487,7 @@ def forward( if not return_dict: outputs = (params) + outputs[1:] - return ((prediction_loss,) + outputs) if prediction_loss is not None else output + return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs return Seq2SeqTSPredictionOutput( loss=prediction_loss, @@ -1497,6 +1500,7 @@ def forward( encoder_hidden_states=outputs.encoder_hidden_states, encoder_attentions=outputs.encoder_attentions, scale=outputs.scale, + static_features=outputs.static_features, ) else: @@ -1507,11 +1511,11 @@ def forward( num_parallel_samples = self.config.num_parallel_samples encoder_inputs, scale, static_feat = self.create_network_inputs( - feat_static_cat, - feat_static_real, - past_time_feat, - past_target, - past_observed_values, + feat_static_cat=feat_static_cat, + feat_static_real=feat_static_real, + past_time_feat=past_time_feat, + past_target=past_target, + past_observed_values=past_observed_values, ) encoder_outputs = encoder(inputs_embeds=encoder_inputs) From 6c6a57bfd221b9a638d9c2432303ac862ead88dc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 12 Aug 2022 17:02:39 -0400 Subject: [PATCH 055/164] clean up some unused bits --- .../modeling_time_series_transformer.py | 112 +----------------- 1 file changed, 5 insertions(+), 107 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 4eb2c2e7eea23..5a27905ac4167 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -159,6 +159,7 @@ class Seq2SeqTSModelOutput(ModelOutput): Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the self-attention heads. scale + static_features """ last_hidden_state: torch.FloatTensor = None @@ -188,23 +189,6 @@ class Seq2SeqTSPredictionOutput(ModelOutput): static_features: Optional[torch.FloatTensor] = None -# class TimeSeriesTransformerLearnedPositionalEmbedding(nn.Embedding): -# """ -# This module learns positional embeddings up to a fixed maximum size. 
-# """ - -# def __init__(self, num_embeddings: int, embedding_dim: int): -# super().__init__(num_embeddings, embedding_dim) - -# def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): -# """`input_ids_shape` is expected to be [bsz x seqlen].""" -# bsz, seq_len = input_ids_shape[:2] -# positions = torch.arange( -# past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device -# ) -# return super().forward(positions) - - class TimeSeriesTransformerAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -727,30 +711,15 @@ class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel): Args: config: TimeSeriesTransformerConfig - embed_tokens (nn.Embedding): output embedding """ - def __init__(self, config: TimeSeriesTransformerConfig, embed_tokens: Optional[nn.Embedding] = None): + def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.encoder_layerdrop embed_dim = config.d_model - # self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - - # self.padding_idx = config.pad_token_id - # self.max_source_positions = config.max_position_embeddings - - # if embed_tokens is not None: - # self.embed_tokens = embed_tokens - # else: - # self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) - - # self.embed_positions = TimeSeriesTransformerLearnedPositionalEmbedding( - # config.max_position_embeddings, - # embed_dim, - # ) self.layers = nn.ModuleList([TimeSeriesTransformerEncoderLayer(config) for _ in range(config.encoder_layers)]) self.layernorm_embedding = nn.LayerNorm(embed_dim) @@ -762,7 +731,6 @@ def __init__(self, config: TimeSeriesTransformerConfig, embed_tokens: Optional[n def forward( self, inputs_embeds: torch.Tensor, - # input_ids=None, attention_mask=None, head_mask=None, output_attentions=None, @@ -771,15 +739,7 @@ def forward( ): r""" Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for details. - - [What are input IDs?](../glossary#input-ids) + inputs_embeds attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: @@ -812,22 +772,6 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # # retrieve input_ids and inputs_embeds - # if input_ids is not None and inputs_embeds is not None: - # raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - # elif input_ids is not None: - # input_shape = input_ids.size() - # input_ids = input_ids.view(-1, input_shape[-1]) - # elif inputs_embeds is not None: - # input_shape = inputs_embeds.size()[:-1] - # else: - # raise ValueError("You have to specify either input_ids or inputs_embeds") - - # if inputs_embeds is None: - # inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - - # embed_pos = self.embed_positions(input_shape) - hidden_states = inputs_embeds # + embed_pos hidden_states = self.layernorm_embedding(hidden_states) @@ -898,28 +842,13 @@ class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel): Args: config: TimeSeriesTransformerConfig - embed_tokens (nn.Embedding): output embedding """ - def __init__(self, config: TimeSeriesTransformerConfig, embed_tokens: Optional[nn.Embedding] = None): + def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.decoder_layerdrop - # self.padding_idx = config.pad_token_id - # self.max_target_positions = config.max_position_embeddings - # self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 - - # if embed_tokens is not None: - # self.embed_tokens = embed_tokens - # else: - # self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) - - # self.embed_positions = TimeSeriesTransformerLearnedPositionalEmbedding( - # config.max_position_embeddings, - # config.d_model, - # ) - self.layers = nn.ModuleList([TimeSeriesTransformerDecoderLayer(config) for _ in range(config.decoder_layers)]) self.layernorm_embedding = nn.LayerNorm(config.d_model) @@ -927,12 +856,6 @@ def __init__(self, config: TimeSeriesTransformerConfig, embed_tokens: Optional[n # Initialize weights and apply final processing self.post_init() - # def get_input_embeddings(self): - # return self.embed_tokens - - # def set_input_embeddings(self, value): - # self.embed_tokens = value - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] @@ -954,7 +877,6 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em def forward( self, inputs_embeds: torch.Tensor, - # input_ids=None, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, @@ -968,15 +890,7 @@ def forward( ): r""" Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for details. - - [What are input IDs?](../glossary#input-ids) + inputs_embeds attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: @@ -1037,24 +951,11 @@ def forward( use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # # retrieve input_ids and inputs_embeds - # if input_ids is not None and inputs_embeds is not None: - # raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - # elif input_ids is not None: - # input_shape = input_ids.size() - # input_ids = input_ids.view(-1, input_shape[-1]) - # elif inputs_embeds is not None: - # input_shape = inputs_embeds.size()[:-1] - # else: - # raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") input_shape = inputs_embeds.size()[:-1] # past_key_values_length past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - # if inputs_embeds is None: - # inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - attention_mask = self._prepare_decoder_attention_mask( attention_mask, input_shape, inputs_embeds, past_key_values_length ) @@ -1064,9 +965,6 @@ def forward( # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) - # # embed positions - # positions = self.embed_positions(input_shape, past_key_values_length) - hidden_states = inputs_embeds # + positions hidden_states = self.layernorm_embedding(hidden_states) From 534671297746cd507d541828f112787a04a65e49 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 12 Aug 2022 18:05:12 -0400 Subject: [PATCH 056/164] fixed typo --- .../time_series_transformer/modeling_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 5a27905ac4167..e19b851c09e52 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1252,7 +1252,7 @@ def forward( return_dict: Optional[bool] = None, ): transformer_inputs, scale, static_feat = self.create_network_inputs( - feat_static_real=feat_static_cat, + feat_static_cat=feat_static_cat, feat_static_real=feat_static_real, past_time_feat=past_time_feat, past_target=past_target, From 6ef3a59c9d860a0e6b9f580843fc59a46e68a533 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 12 Aug 2022 18:19:00 -0400 Subject: [PATCH 057/164] set return_dict if none --- .../time_series_transformer/modeling_time_series_transformer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index e19b851c09e52..33e79d1e977c0 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1355,6 +1355,8 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + prediction_loss = None if future_target is not None and future_observed_values is not None: # training From 
ec4ab0f0a70e76fcc07ba1cf0b94103417c0e6b1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 12 Aug 2022 18:55:52 -0400 Subject: [PATCH 058/164] call model once for both train/predict --- .../modeling_time_series_transformer.py | 45 ++++++++----------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 33e79d1e977c0..634ea1607e73a 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1357,22 +1357,22 @@ def forward( ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.model( + feat_static_cat=feat_static_cat, + feat_static_real=feat_static_real, + past_time_feat=past_time_feat, + past_target=past_target, + past_observed_values=past_observed_values, + future_time_feat=future_time_feat, + future_target=future_target, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + use_cache=use_cache, + ) + prediction_loss = None if future_target is not None and future_observed_values is not None: - # training - outputs = self.model( - feat_static_cat, - feat_static_real, - past_time_feat, - past_target, - past_observed_values, - future_time_feat, - future_target, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - use_cache=use_cache, - ) params = self.output_params(outputs.last_hidden_state) distr = self.output_distribution(params, outputs.scale) @@ -1405,22 +1405,13 @@ def forward( else: # prediction - encoder = self.model.get_encoder() decoder = self.model.get_decoder() - num_parallel_samples = self.config.num_parallel_samples - - encoder_inputs, scale, static_feat = self.create_network_inputs( - feat_static_cat=feat_static_cat, - feat_static_real=feat_static_real, - past_time_feat=past_time_feat, - past_target=past_target, - past_observed_values=past_observed_values, - ) - - encoder_outputs = encoder(inputs_embeds=encoder_inputs) - enc_last_hidden = encoder_outputs.last_hidden_state + enc_last_hidden = outputs.encoder_last_hidden_state + scale = outputs.scale + static_feat = outputs.static_features + num_parallel_samples = self.config.num_parallel_samples repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0) repeated_past_target = past_target.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale From 87a8f1b238c6809bcd82e38126668a7e2ea33f51 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 12 Aug 2022 18:58:21 -0400 Subject: [PATCH 059/164] use cache if future_target is none --- .../time_series_transformer/modeling_time_series_transformer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 634ea1607e73a..3804d12f5da4c 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1356,6 +1356,8 @@ def forward( return_dict: Optional[bool] = None, ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if future_target is not 
None: + use_cache = False outputs = self.model( feat_static_cat=feat_static_cat, From 01343f706159d1de9e0ea893f9adc2f146c984dd Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 14 Aug 2022 12:24:52 -0400 Subject: [PATCH 060/164] initial generate func --- .../modeling_time_series_transformer.py | 105 +++++++++--------- 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 3804d12f5da4c..25957a5748581 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1387,72 +1387,73 @@ def forward( prediction_loss = weighted_average(loss, weights=loss_weights) - if not return_dict: - outputs = (params) + outputs[1:] - return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs - - return Seq2SeqTSPredictionOutput( - loss=prediction_loss, - params=params, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - scale=outputs.scale, - static_features=outputs.static_features, - ) + if not return_dict: + outputs = (params) + outputs[1:] + return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs - else: - # prediction - decoder = self.model.get_decoder() + return Seq2SeqTSPredictionOutput( + loss=prediction_loss, + params=params, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + scale=outputs.scale, + static_features=outputs.static_features, + ) - enc_last_hidden = outputs.encoder_last_hidden_state - scale = outputs.scale - static_feat = outputs.static_features - num_parallel_samples = self.config.num_parallel_samples - repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0) + @torch.no_grad() + def generate(self, past_target, future_time_feat, outputs) -> torch.Tensor: + decoder = self.model.get_decoder() - repeated_past_target = past_target.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale + enc_last_hidden = outputs.encoder_last_hidden_state + scale = outputs.scale + static_feat = outputs.static_features - expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_feat.shape[1], -1) - features = torch.cat((expanded_static_feat, future_time_feat), dim=-1) - repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0) + num_parallel_samples = self.config.num_parallel_samples + repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0) - repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0) + repeated_past_target = past_target.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale - future_samples = [] + expanded_static_feat = static_feat.unsqueeze(1).expand(-1, 
future_time_feat.shape[1], -1) + features = torch.cat((expanded_static_feat, future_time_feat), dim=-1) + repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0) - # greedy decoding - for k in range(self.config.prediction_length): - lagged_sequence = self.get_lagged_subsequences( - sequence=repeated_past_target, - subsequences_length=1 + k, - shift=1, - ) + repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0) + + future_samples = [] - lags_shape = lagged_sequence.shape - reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1) + # greedy decoding + for k in range(self.config.prediction_length): + lagged_sequence = self.get_lagged_subsequences( + sequence=repeated_past_target, + subsequences_length=1 + k, + shift=1, + ) - decoder_input = torch.cat((reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1) + lags_shape = lagged_sequence.shape + reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1) - dec_output = decoder(inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden) - dec_last_hidden = dec_output.last_hidden_state + decoder_input = torch.cat((reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1) - params = self.param_proj(dec_last_hidden[:, -1:]) - distr = self.output_distribution(params, scale=repeated_scale) - next_sample = distr.sample() + dec_output = decoder(inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden) + dec_last_hidden = dec_output.last_hidden_state - repeated_past_target = torch.cat((repeated_past_target, next_sample / repeated_scale), dim=1) - future_samples.append(next_sample) + params = self.param_proj(dec_last_hidden[:, -1:]) + distr = self.output_distribution(params, scale=repeated_scale) + next_sample = distr.sample() - concat_future_samples = torch.cat(future_samples, dim=1) - return concat_future_samples.reshape( - (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape, - ) + repeated_past_target = torch.cat((repeated_past_target, next_sample / repeated_scale), dim=1) + future_samples.append(next_sample) + + concat_future_samples = torch.cat(future_samples, dim=1) + return concat_future_samples.reshape( + (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape, + ) @add_start_docstrings( From be1f132ecde2b05a85af1cdaa7a050ec63229b2d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 15 Aug 2022 19:24:59 -0400 Subject: [PATCH 061/164] generate arguments --- .../modeling_time_series_transformer.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 25957a5748581..deb52b2ca80c6 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1405,9 +1405,33 @@ def forward( static_features=outputs.static_features, ) - @torch.no_grad() - def generate(self, past_target, future_time_feat, outputs) -> torch.Tensor: + def generate( + self, + feat_static_cat: torch.Tensor, + feat_static_real: torch.Tensor, + past_time_feat: torch.Tensor, + past_target: torch.Tensor, + past_observed_values: torch.Tensor, + future_time_feat: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + 
output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> torch.Tensor: + outputs = self( + feat_static_cat=feat_static_cat, + feat_static_real=feat_static_real, + past_time_feat=past_time_feat, + past_target=past_target, + past_observed_values=past_observed_values, + future_time_feat=future_time_feat, + future_target=None, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=False, + use_cache=use_cache, + ) + decoder = self.model.get_decoder() enc_last_hidden = outputs.encoder_last_hidden_state From 3cbfd7b0dc77a4e8626797edc599825f11de86ab Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 15 Aug 2022 19:37:13 -0400 Subject: [PATCH 062/164] future_time_feat is required --- .../modeling_time_series_transformer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index deb52b2ca80c6..d1e7b331f8030 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1413,8 +1413,7 @@ def generate( past_time_feat: torch.Tensor, past_target: torch.Tensor, past_observed_values: torch.Tensor, - future_time_feat: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, + future_time_feat: Optional[torch.Tensor], output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> torch.Tensor: @@ -1429,11 +1428,10 @@ def generate( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=False, - use_cache=use_cache, + use_cache=True, ) decoder = self.model.get_decoder() - enc_last_hidden = outputs.encoder_last_hidden_state scale = outputs.scale static_feat = outputs.static_features From 89e9279dfbfd26bc03d20bfca5a48c418ec94865 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 16 Aug 2022 20:55:52 -0400 Subject: [PATCH 063/164] return SampleTSPredictionOutput --- .../modeling_time_series_transformer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index d1e7b331f8030..93dbc65d6259a 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -189,6 +189,11 @@ class Seq2SeqTSPredictionOutput(ModelOutput): static_features: Optional[torch.FloatTensor] = None +@dataclass +class SampleTSPredictionOutput(ModelOutput): + sequences: torch.FloatTensor = None + + class TimeSeriesTransformerAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -1473,8 +1478,11 @@ def generate( future_samples.append(next_sample) concat_future_samples = torch.cat(future_samples, dim=1) - return concat_future_samples.reshape( - (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape, + + return SampleTSPredictionOutput( + sequences=concat_future_samples.reshape( + (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape, + ) ) From 686db7851475073a09721366f975cc99483b028b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 18 Aug 2022 10:03:28 -0400 
Subject: [PATCH 064/164] removed unneeded classes --- src/transformers/__init__.py | 4 - src/transformers/models/auto/modeling_auto.py | 7 - .../time_series_transformer/__init__.py | 4 - .../modeling_time_series_transformer.py | 604 +----------------- .../test_modeling_time_series_transformer.py | 5 +- 5 files changed, 2 insertions(+), 622 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3f87ab68efe51..5cb2abe194543 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -4217,10 +4217,6 @@ ) from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - TimeSeriesTransformerForCausalLM, - TimeSeriesTransformerForConditionalGeneration, - TimeSeriesTransformerForQuestionAnswering, - TimeSeriesTransformerForSequenceClassification, TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel, TimeSeriesTransformerPreTrainedModel, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 0df2ee1f30584..b26966d5f9873 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -198,8 +198,6 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( [ # Model with LM heads mapping - - ("time_series_transformer", "TimeSeriesTransformerForConditionalGeneration"), ("albert", "AlbertForMaskedLM"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForMaskedLM"), @@ -262,7 +260,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( [ # Model for Causal LM mapping - ("time_series_transformer", "TimeSeriesTransformerForCausalLM"), ("bart", "BartForCausalLM"), ("bert", "BertLMHeadModel"), ("bert-generation", "BertGenerationDecoder"), @@ -438,8 +435,6 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( [ # Model for Seq2Seq Causal LM mapping - - ("time_series_transformer", "TimeSeriesTransformerForConditionalGeneration"), ("bart", "BartForConditionalGeneration"), ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"), ("blenderbot", "BlenderbotForConditionalGeneration"), @@ -471,7 +466,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Sequence Classification mapping - ("time_series_transformer", "TimeSeriesTransformerForSequenceClassification"), ("albert", "AlbertForSequenceClassification"), ("bart", "BartForSequenceClassification"), ("bert", "BertForSequenceClassification"), @@ -528,7 +522,6 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( [ # Model for Question Answering mapping - ("time_series_transformer", "TimeSeriesTransformerForQuestionAnswering"), ("albert", "AlbertForQuestionAnswering"), ("bart", "BartForQuestionAnswering"), ("bert", "BertForQuestionAnswering"), diff --git a/src/transformers/models/time_series_transformer/__init__.py b/src/transformers/models/time_series_transformer/__init__.py index 0fbb3e85314a8..cb4c13ca58613 100644 --- a/src/transformers/models/time_series_transformer/__init__.py +++ b/src/transformers/models/time_series_transformer/__init__.py @@ -57,10 +57,6 @@ else: from .modeling_time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - TimeSeriesTransformerForConditionalGeneration, - TimeSeriesTransformerForCausalLM, - TimeSeriesTransformerForQuestionAnswering, - TimeSeriesTransformerForSequenceClassification, TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel, TimeSeriesTransformerPreTrainedModel, diff --git 
a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 93dbc65d6259a..7bf22df76e183 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -577,7 +577,7 @@ def _set_gradient_checkpointing(self, module, value=False): weights. """ -TIME_SERIES_TRANSFORMER_GENERATION_EXAMPLE = r""" +TIME_SERIES_TRANSFORMER_PREDICTION_EXAMPLE = r""" Summarization example: ```python @@ -1486,418 +1486,6 @@ def generate( ) -@add_start_docstrings( - "The TimeSeriesTransformer Model with a language modeling head. Can be used for summarization.", - TIME_SERIES_TRANSFORMER_START_DOCSTRING, -) -class TimeSeriesTransformerForConditionalGeneration(TimeSeriesTransformerPreTrainedModel): - base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder\.version", - r"decoder\.version", - r"lm_head\.weight", - ] - - def __init__(self, config: TimeSeriesTransformerConfig): - super().__init__(config) - self.model = TimeSeriesTransformerModel(config) - self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) - self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_encoder(self): - return self.model.get_encoder() - - def get_decoder(self): - return self.model.get_decoder() - - def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: - new_embeddings = super().resize_token_embeddings(new_num_tokens) - self._resize_final_logits_bias(new_num_tokens) - return new_embeddings - - def _resize_final_logits_bias(self, new_num_tokens: int) -> None: - old_num_tokens = self.final_logits_bias.shape[-1] - if new_num_tokens <= old_num_tokens: - new_bias = self.final_logits_bias[:, :new_num_tokens] - else: - extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) - new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) - self.register_buffer("final_logits_bias", new_bias) - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) - @add_end_docstrings(TIME_SERIES_TRANSFORMER_GENERATION_EXAMPLE) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
- - Returns: - - Conditional generation example: - - ```python - >>> from transformers import TimeSeriesTransformerTokenizer, TimeSeriesTransformerForConditionalGeneration - >>> tokenizer = TimeSeriesTransformerTokenizer.from_pretrained('huggingface/tst-ett') - >>> TXT = "My friends are but they eat too many carbs." - - >>> model = TimeSeriesTransformerForConditionalGeneration.from_pretrained('huggingface/tst-ett') - >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] - >>> logits = model(input_ids).logits - - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - - >>> tokenizer.decode(predictions).split() - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if labels is not None: - if use_cache: - logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") - use_cache = False - if decoder_input_ids is None: - decoder_input_ids = shift_tokens_right( - labels, self.config.pad_token_id, self.config.decoder_start_token_id - ) - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - encoder_outputs=encoder_outputs, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (lm_logits,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return Seq2SeqLMOutput( - loss=masked_lm_loss, - logits=lm_logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs - ): - # cut decoder_input_ids if past is used - if past is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - - @staticmethod - def _reorder_cache(past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) - return reordered_past - - -@add_start_docstrings( - """ - TimeSeriesTransformer model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE - tasks. - """, - TIME_SERIES_TRANSFORMER_START_DOCSTRING, -) -class TimeSeriesTransformerForSequenceClassification(TimeSeriesTransformerPreTrainedModel): - def __init__(self, config: TimeSeriesTransformerConfig, **kwargs): - super().__init__(config, **kwargs) - self.model = TimeSeriesTransformerModel(config) - self.classification_head = TimeSeriesTransformerClassificationHead( - config.d_model, - config.d_model, - config.num_labels, - config.classifier_dropout, - ) - self.model._init_weights(self.classification_head.dense) - self.model._init_weights(self.classification_head.out_proj) - - @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=Seq2SeqSequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - if input_ids is None and inputs_embeds is not None: - raise NotImplementedError( - f"Passing input embeddings is currently not supported for {self.__class__.__name__}" - ) - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = outputs[0] # last hidden state - - eos_mask = input_ids.eq(self.config.eos_token_id) - - if len(torch.unique_consecutive(eos_mask.sum(1))) > 1: - raise ValueError("All examples must have the same number of tokens.") - sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ - :, -1, : - ] - logits = self.classification_head(sentence_representation) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.config.num_labels == 1: - self.config.problem_type = "regression" - elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.config.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - - return Seq2SeqSequenceClassifierOutput( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - -@add_start_docstrings( - """ - TimeSeriesTransformer Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
- """, - TIME_SERIES_TRANSFORMER_START_DOCSTRING, -) -class TimeSeriesTransformerForQuestionAnswering(TimeSeriesTransformerPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - config.num_labels = 2 - self.num_labels = config.num_labels - - self.model = TimeSeriesTransformerModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.model._init_weights(self.qa_outputs) - - @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=Seq2SeqQuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs=None, - start_positions=None, - end_positions=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence - are not taken into account for computing the loss. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if start_positions is not None and end_positions is not None: - use_cache = False - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = ( - start_logits, - end_logits, - ) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - - return Seq2SeqQuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - 
past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - class TimeSeriesTransformerDecoderWrapper(TimeSeriesTransformerPreTrainedModel): """ This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is @@ -1910,193 +1498,3 @@ def __init__(self, config): def forward(self, *args, **kwargs): return self.decoder(*args, **kwargs) - - -class TimeSeriesTransformerForCausalLM(TimeSeriesTransformerPreTrainedModel): - def __init__(self, config): - config = copy.deepcopy(config) - config.is_decoder = True - config.is_encoder_decoder = False - super().__init__(config) - self.model = TimeSeriesTransformerDecoderWrapper(config) - - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.decoder.embed_tokens - - def set_input_embeddings(self, value): - self.model.decoder.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model.decoder = decoder - - def get_decoder(self): - return self.model.decoder - - @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - head_mask=None, - cross_attn_head_mask=None, - past_key_values=None, - inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - if the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used - in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules. 
Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up - decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` - (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` - instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are - ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up - decoding (see `past_key_values`). - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - - Returns: - - Example: - - ```python - >>> from transformers import TimeSeriesTransformerTokenizer, TimeSeriesTransformerForCausalLM - - >>> tokenizer = TimeSeriesTransformerTokenizer.from_pretrained('facebook/bart-large') - >>> model = TimeSeriesTransformerForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False) - >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." 
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> logits = outputs.logits - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model.decoder( - input_ids=input_ids, - attention_mask=attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - head_mask=head_mask, - cross_attn_head_mask=cross_attn_head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - logits = self.lm_head(outputs[0]) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past: - input_ids = input_ids[:, -1:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. 
input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past, - "use_cache": use_cache, - } - - @staticmethod - def _reorder_cache(past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) - return reordered_past diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 0faaa53b43ef7..feb7d9801b071 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -33,10 +33,6 @@ from transformers import ( TimeSeriesTransformerConfig, - TimeSeriesTransformerForConditionalGeneration, - TimeSeriesTransformerForQuestionAnswering, - TimeSeriesTransformerForCausalLM, - TimeSeriesTransformerForSequenceClassification, TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel, ) @@ -208,6 +204,7 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, un all_model_classes = ( ( TimeSeriesTransformerModel, + TimeSeriesTransformerForPrediction, TimeSeriesTransformerForConditionalGeneration, TimeSeriesTransformerForSequenceClassification, TimeSeriesTransformerForQuestionAnswering, From 336fb95d8e9adefdc69100c74015246627385dd1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 18 Aug 2022 12:42:27 -0400 Subject: [PATCH 065/164] fix when params is none --- .../modeling_time_series_transformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 7bf22df76e183..f6dd7a4ec8370 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -581,10 +581,9 @@ def _set_gradient_checkpointing(self, module, value=False): Summarization example: ```python - >>> from transformers import TimeSeriesTransformerTokenizer, TimeSeriesTransformerForConditionalGeneration + >>> from transformers import TimeSeriesTransformerForPrediction >>> model = TimeSeriesTransformerForConditionalGeneration.from_pretrained('huggingface/tst-ett') - >>> tokenizer = TimeSeriesTransformerTokenizer.from_pretrained('huggingface/tst-ett') >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." 
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') @@ -1379,6 +1378,7 @@ def forward( ) prediction_loss = None + params = None if future_target is not None and future_observed_values is not None: params = self.output_params(outputs.last_hidden_state) distr = self.output_distribution(params, outputs.scale) @@ -1393,7 +1393,7 @@ def forward( prediction_loss = weighted_average(loss, weights=loss_weights) if not return_dict: - outputs = (params) + outputs[1:] + outputs = (params + outputs[1:]) if params is not None else outputs[1:] return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs return Seq2SeqTSPredictionOutput( From 4623d2d95afc1d05119c2999aefe69cc8a4568dc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 18 Aug 2022 14:36:43 -0400 Subject: [PATCH 066/164] fix return dict --- .../modeling_time_series_transformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index f6dd7a4ec8370..d951db9090841 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1283,7 +1283,7 @@ def forward( # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs, + last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, ) @@ -1291,7 +1291,7 @@ def forward( dec_input = transformer_inputs[:, self.config.context_length :, ...] decoder_outputs = self.decoder( inputs_embeds=dec_input, - encoder_hidden_states=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs[0], use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -1432,7 +1432,7 @@ def generate( future_target=None, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=False, + return_dict=True, use_cache=True, ) From 035f016a0785073a9638aabdd2151c4f5c15787d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 18 Aug 2022 15:09:05 -0400 Subject: [PATCH 067/164] fix num_attention_heads --- .../configuration_time_series_transformer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 0535656334678..7770bbceb6ffa 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -72,7 +72,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): Number of encoder layers. decoder_layers (`int`, *optional*, defaults to 2): Number of decoder layers. - nhead (`int`, *optional*, defaults to 2): + num_attention_heads (`int`, *optional*, defaults to 2): Number of attention heads for each attention layer in the Transformer encoder and decoder. 
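(Editor's aside, not part of the patch: a minimal sketch of constructing the configuration as it stands after this commit. All values are arbitrary; `prediction_length` and `context_length` are assumed to exist from earlier patches in the series, while `num_attention_heads`, `ffn_dim`, `encoder_layers`, and `decoder_layers` appear in the hunk above. A later commit below splits the head and feed-forward settings into separate encoder/decoder arguments.)

```python
from transformers import TimeSeriesTransformerConfig

# Hypothetical values for illustration only.
config = TimeSeriesTransformerConfig(
    prediction_length=24,   # assumed from earlier patches in the series
    context_length=48,      # assumed from earlier patches in the series
    num_attention_heads=2,  # renamed from `nhead` in this commit
    ffn_dim=32,
    encoder_layers=2,
    decoder_layers=2,
)
```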
ffn_dim (`int`, *optional*, defaults to 32): Dimension of the "intermediate" (often named feed-forward) layer in encoder and decoder. @@ -116,7 +116,7 @@ def __init__( cardinality: Optional[List[int]] = None, embedding_dimension: Optional[List[int]] = None, ffn_dim: int = 32, - nhead: int = 2, + num_attention_heads: int = 2, encoder_layers: int = 2, decoder_layers: int = 2, is_encoder_decoder: bool = True, @@ -147,9 +147,8 @@ def __init__( self.num_parallel_samples = num_parallel_samples # Transformer architecture parameters - self.nhead = nhead - self.encoder_attention_heads = nhead - self.decoder_attention_heads = nhead + self.encoder_attention_heads = num_attention_heads + self.decoder_attention_heads = num_attention_heads self.encoder_layers = encoder_layers self.decoder_layers = decoder_layers From 8e2c2fa657a87a9ae180aa6e8b0f0288901e32a7 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 18 Aug 2022 15:45:52 -0400 Subject: [PATCH 068/164] fix arguments --- .../configuration_time_series_transformer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 7770bbceb6ffa..9074385622ce1 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -115,8 +115,10 @@ def __init__( num_feat_static_real: int = 0, cardinality: Optional[List[int]] = None, embedding_dimension: Optional[List[int]] = None, - ffn_dim: int = 32, - num_attention_heads: int = 2, + encoder_ffn_dim: int = 32, + decoder_ffn_dim: int = 32, + decoder_attention_heads: int = 2, + encoder_attention_heads: int = 2, encoder_layers: int = 2, decoder_layers: int = 2, is_encoder_decoder: bool = True, @@ -147,14 +149,12 @@ def __init__( self.num_parallel_samples = num_parallel_samples # Transformer architecture parameters - self.encoder_attention_heads = num_attention_heads - self.decoder_attention_heads = num_attention_heads - + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.encoder_ffn_dim = encoder_ffn_dim + self.decoder_ffn_dim = decoder_ffn_dim self.encoder_layers = encoder_layers self.decoder_layers = decoder_layers - self.ffn_dim = ffn_dim - self.encoder_ffn_dim = ffn_dim - self.decoder_ffn_dim = ffn_dim self.dropout = dropout self.attention_dropout = dropout From ccdf04849c9f050a1cdef4f5c81d6da2cb633ada Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 18 Aug 2022 19:20:01 -0400 Subject: [PATCH 069/164] remove unused shift_tokens_right --- .../modeling_time_series_transformer.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index d951db9090841..7e6f5838827de 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -21,7 +21,6 @@ import torch from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from gluonts.torch.modules.scaler import MeanScaler, NOPScaler from gluonts.torch.modules.feature import FeatureEmbedder @@ -42,11 +41,6 @@ 
BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput, - Seq2SeqLMOutput, - Seq2SeqModelOutput, - Seq2SeqQuestionAnsweringModelOutput, - Seq2SeqSequenceClassifierOutput, - CausalLMOutputWithCrossAttentions, ) from ...modeling_utils import PreTrainedModel from ...utils import logging @@ -65,21 +59,6 @@ ] -def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): - """ - Shift input ids one token to the right. - """ - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() - shifted_input_ids[:, 0] = decoder_start_token_id - - assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." - # replace possible -100 values in labels by `pad_token_id` - shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) - - return shifted_input_ids - - def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): """ Make causal mask used for bi-directional self-attention. From 9cbb5007db5d09cef944298ccf91b3e76806d5ba Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 19 Aug 2022 16:28:44 -0400 Subject: [PATCH 070/164] add different dropout configs --- .../configuration_time_series_transformer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 9074385622ce1..f26961027da01 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -124,6 +124,10 @@ def __init__( is_encoder_decoder: bool = True, activation_function: str = "gelu", dropout: float = 0.1, + encoder_layerdrop: float = 0.1, + decoder_layerdrop: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, num_parallel_samples: int = 100, init_std: float = 0.02, use_cache=True, @@ -157,10 +161,10 @@ def __init__( self.decoder_layers = decoder_layers self.dropout = dropout - self.attention_dropout = dropout - self.activation_dropout = dropout - self.encoder_layerdrop = dropout - self.decoder_layerdrop = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop self.activation_function = activation_function self.init_std = init_std From 996911a6bd76dc9028322da88ccfca5828c6bc02 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sat, 3 Sep 2022 12:43:47 +0200 Subject: [PATCH 071/164] implement FeatureEmbedder, Scaler and weighted_average --- .../time_series_transformer/__init__.py | 3 +- .../modeling_time_series_transformer.py | 207 +++++++++++++++--- .../time_series_transformations.py | 12 +- 3 files changed, 185 insertions(+), 37 deletions(-) diff --git a/src/transformers/models/time_series_transformer/__init__.py b/src/transformers/models/time_series_transformer/__init__.py index cb4c13ca58613..221cc874092e7 100644 --- a/src/transformers/models/time_series_transformer/__init__.py +++ b/src/transformers/models/time_series_transformer/__init__.py @@ -18,8 +18,7 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from ...utils import _LazyModule, OptionalDependencyNotAvailable -from ...utils import is_torch_available +from ...utils 
import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 7e6f5838827de..e8d9da60ace1c 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -15,35 +15,25 @@ """ PyTorch TimeSeriesTransformer model. """ import copy -from dataclasses import dataclass import random -from typing import Optional, Tuple, List +from dataclasses import dataclass +from typing import List, Optional, Tuple import torch -from torch import nn - -from gluonts.torch.modules.scaler import MeanScaler, NOPScaler -from gluonts.torch.modules.feature import FeatureEmbedder from gluonts.torch.distributions import StudentTOutput -from gluonts.torch.modules.loss import NegativeLogLikelihood -from gluonts.torch.util import weighted_average - +from torch import nn from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput +from ...modeling_utils import PreTrainedModel from ...utils import ( add_code_sample_docstrings, add_end_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + logging, replace_return_docstrings, ) -from ...modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - ModelOutput, -) -from ...modeling_utils import PreTrainedModel -from ...utils import logging from .configuration_time_series_transformer import TimeSeriesTransformerConfig @@ -59,6 +49,163 @@ ] +class FeatureEmbedder(nn.Module): + def __init__( + self, + cardinalities: List[int], + embedding_dims: List[int], + ) -> None: + super().__init__() + + self._num_features = len(cardinalities) + self._embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)]) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + if self._num_features > 1: + # we slice the last dimension, giving an array of length + # self._num_features with shape (N,T) or (N) + cat_feature_slices = torch.chunk(features, self._num_features, dim=-1) + else: + cat_feature_slices = [features] + + return torch.cat( + [ + embed(cat_feature_slice.squeeze(-1)) + for embed, cat_feature_slice in zip(self._embedders, cat_feature_slices) + ], + dim=-1, + ) + + +class MeanScaler(nn.Module): + """ + Computes a scaling factor as the weighted average absolute value along + dimension ``dim``, and scales the data accordingly. + Parameters + ---------- + dim + dimension along which to compute the scale + keepdim + controls whether to retain dimension ``dim`` (of length 1) in the + scale tensor, or suppress it. + minimum_scale + default scale that is used for elements that are constantly zero + along dimension ``dim``. 
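For readers skimming the diff, here is a minimal sketch of what the mean scaler below computes. It is illustrative only and omits the per-batch default scale that the class falls back to when a series contains no observed values:

```python
import torch

def mean_abs_scale(data, weights, minimum_scale=1e-10):
    # scale_i = sum_t |x_{i,t}| * w_{i,t} / max(sum_t w_{i,t}, 1), clipped from below
    denominator = torch.clamp(weights.sum(dim=1), min=1.0)
    scale = torch.clamp((data.abs() * weights).sum(dim=1) / denominator, min=minimum_scale)
    return data / scale.unsqueeze(1), scale.unsqueeze(1)

data = torch.tensor([[1.0, -2.0, 3.0], [10.0, 0.0, 30.0]])
observed = torch.tensor([[1.0, 1.0, 1.0], [1.0, 0.0, 1.0]])
scaled, scale = mean_abs_scale(data, observed)
print(scale)  # tensor([[ 2.], [20.]])
```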
+ """ + + def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10): + super().__init__() + assert dim > 0, "Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0" + self.dim = dim + self.keepdim = keepdim + self.register_buffer("minimum_scale", torch.tensor(minimum_scale)) + + def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + # these will have shape (N, C) + total_weight = weights.sum(dim=self.dim) + weighted_sum = (data.abs() * weights).sum(dim=self.dim) + + # first compute a global scale per-dimension + total_observed = total_weight.sum(dim=0) + denominator = torch.max(total_observed, torch.ones_like(total_observed)) + default_scale = weighted_sum.sum(dim=0) / denominator + + # then compute a per-item, per-dimension scale + denominator = torch.max(total_weight, torch.ones_like(total_weight)) + scale = weighted_sum / denominator + + # use per-batch scale when no element is observed + # or when the sequence contains only zeros + scale = ( + torch.max( + self.minimum_scale, + torch.where( + weighted_sum > torch.zeros_like(weighted_sum), + scale, + default_scale * torch.ones_like(total_weight), + ), + ) + .detach() + .unsqueeze(dim=self.dim) + ) + + return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim) + + +class NOPScaler(nn.Module): + """ + Assigns a scaling factor equal to 1 along dimension ``dim``, and therefore + applies no scaling to the input data. + Parameters + ---------- + dim + dimension along which to compute the scale + keepdim + controls whether to retain dimension ``dim`` (of length 1) in the + scale tensor, or suppress it. + """ + + def __init__(self, dim: int, keepdim: bool = False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + + def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + scale = torch.ones_like(data).mean( + dim=self.dim, + keepdim=self.keepdim, + ) + return data, scale + + +def weighted_average(x: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: + """ + Computes the weighted average of a given tensor across a given dim, masking + values associated with weight zero, + meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. + Parameters + ---------- + x + Input tensor, of which the average must be computed. + weights + Weights tensor, of the same shape as `x`. + dim + The dim along which to average `x` + Returns + ------- + Tensor: + The tensor with values averaged along the specified `dim`. + """ + if weights is not None: + weighted_tensor = torch.where(weights != 0, x * weights, torch.zeros_like(x)) + sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0) + return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights + else: + return x.mean(dim=dim) + + +class NegativeLogLikelihood: + """ + Compute the negative log likelihood loss. + Parameters + ---------- + beta: float in range (0, 1) + beta parameter from the paper: "On the Pitfalls of Heteroscedastic + Uncertainty Estimation with Probabilistic Neural Networks" by + Seitzer et al. 
2022 + https://openreview.net/forum?id=aPOpXlnV1T + """ + + beta: float = 0.0 + + def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor: + nll = -input.log_prob(target) + if self.beta > 0.0: + variance = input.variance + nll = nll * (variance.detach() ** self.beta) + return nll + + def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): """ Make causal mask used for bi-directional self-attention. @@ -189,9 +336,10 @@ def __init__( self.num_heads = num_heads self.dropout = dropout self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + assert self.head_dim * num_heads == self.embed_dim, ( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder @@ -261,7 +409,8 @@ def forward( if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" ) if attention_mask is not None: @@ -277,7 +426,8 @@ def forward( if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): raise ValueError( - f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" ) attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) @@ -298,7 +448,8 @@ def forward( if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) @@ -962,9 +1113,10 @@ def forward( # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): if attn_mask is not None: - assert attn_mask.size()[0] == ( - len(self.layers) - ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + assert attn_mask.size()[0] == (len(self.layers)), ( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: @@ -976,10 +1128,10 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( - "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`..." + "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache =" + " False`..." 
) use_cache = False @@ -1001,7 +1153,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/time_series_transformer/time_series_transformations.py b/src/transformers/models/time_series_transformer/time_series_transformations.py index 6f88776e12c0c..1c851c9928748 100644 --- a/src/transformers/models/time_series_transformer/time_series_transformations.py +++ b/src/transformers/models/time_series_transformer/time_series_transformations.py @@ -14,15 +14,14 @@ # limitations under the License. """ Transformations for Time Series Transformers. """ -from typing import Optional, List, Iterable from functools import lru_cache +from typing import Iterable, List, Optional import pandas as pd - -from torch.utils.data import DataLoader - -from gluonts.time_feature import time_features_from_frequency_str from gluonts.dataset.field_names import FieldName +from gluonts.itertools import Cyclic, IterableSlice, PseudoShuffled +from gluonts.time_feature import time_features_from_frequency_str +from gluonts.torch.util import IterableDataset from gluonts.transform import ( AddAgeFeature, AddObservedValuesIndicator, @@ -40,8 +39,7 @@ VstackFeatures, ) from gluonts.transform.sampler import InstanceSampler -from gluonts.itertools import Cyclic, IterableSlice, PseudoShuffled -from gluonts.torch.util import IterableDataset +from torch.utils.data import DataLoader @lru_cache(10_000) From 54460c79e7b6a438708e8d6e5886d64b382d44bf Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 5 Sep 2022 10:54:22 +0200 Subject: [PATCH 072/164] remove gluonts dependency --- .../modeling_time_series_transformer.py | 162 +++++++++++++++++- 1 file changed, 158 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index e8d9da60ace1c..7b700428d166b 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -17,11 +17,12 @@ import copy import random from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Callable, Dict, Type import torch -from gluonts.torch.distributions import StudentTOutput from torch import nn +from torch.distributions import TransformedDistribution, AffineTransform, Distribution, StudentT +import torch.nn.functional as F from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput @@ -49,6 +50,159 @@ ] +class AffineTransformed(TransformedDistribution): + def __init__(self, base_distribution: Distribution, loc=None, scale=None): + + self.scale = 1.0 if scale is None else scale + self.loc = 0.0 if loc is None else loc + + super().__init__(base_distribution, [AffineTransform(self.loc, self.scale)]) + + @property + def mean(self): + """ + Returns the mean of the distribution. + """ + return self.base_dist.mean * self.scale + self.loc + + @property + def variance(self): + """ + Returns the variance of the distribution. + """ + return self.base_dist.variance * self.scale**2 + + @property + def stddev(self): + """ + Returns the standard deviation of the distribution. 
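As a sanity check on the wrapper above, the same behaviour can be reproduced with stock `torch.distributions` (illustrative, not the patch's classes): an affine transform `y = loc + scale * x` shifts the location and rescales the spread, but the stock `TransformedDistribution` does not expose `mean`/`variance`/`stddev`, which is exactly why the patch adds those properties:

```python
import torch
from torch.distributions import AffineTransform, Normal, TransformedDistribution

base = Normal(loc=torch.tensor(0.0), scale=torch.tensor(1.0))
affine = TransformedDistribution(base, [AffineTransform(loc=3.0, scale=2.0)])

samples = affine.sample((5,))              # samples live on the rescaled domain
print(affine.log_prob(torch.tensor(3.0)))  # log-density of N(mean=3, std=2) at 3.0
# Accessing `affine.mean` would raise NotImplementedError; AffineTransformed
# instead derives it from the base distribution as base.mean * scale + loc.
```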
+ """ + return self.variance.sqrt() + + +class PtArgProj(nn.Module): + def __init__( + self, + in_features: int, + args_dim: Dict[str, int], + domain_map: Callable[..., Tuple[torch.Tensor]], + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.args_dim = args_dim + self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()]) + self.domain_map = domain_map + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]: + params_unbounded = [proj(x) for proj in self.proj] + + return self.domain_map(*params_unbounded) + + +class LambdaLayer(nn.Module): + def __init__(self, function): + super().__init__() + self._func = function + + def forward(self, x, *args): + return self._func(x, *args) + + +class Output: + in_features: int + args_dim: Dict[str, int] + _dtype: Type = np.float32 + + @property + def dtype(self): + return self._dtype + + @dtype.setter + def dtype(self, dtype: Type): + self._dtype = dtype + + def get_args_proj(self, in_features: int) -> nn.Module: + return PtArgProj( + in_features=in_features, + args_dim=self.args_dim, + domain_map=LambdaLayer(self.domain_map), + ) + + def domain_map(self, *args: torch.Tensor): + raise NotImplementedError() + + +class DistributionOutput(Output): + distr_cls: type + + def __init__(self) -> None: + pass + + def _base_distribution(self, distr_args): + return self.distr_cls(*distr_args) + + def distribution( + self, + distr_args, + loc: Optional[torch.Tensor] = None, + scale: Optional[torch.Tensor] = None, + ) -> Distribution: + distr = self._base_distribution(distr_args) + if loc is None and scale is None: + return distr + else: + return AffineTransformed(distr, loc=loc, scale=scale) + + @property + def event_shape(self) -> Tuple: + r""" + Shape of each individual event contemplated by the distributions + that this object constructs. + """ + raise NotImplementedError() + + @property + def event_dim(self) -> int: + r""" + Number of event dimensions, i.e., length of the `event_shape` tuple, + of the distributions that this object constructs. + """ + return len(self.event_shape) + + @property + def value_in_support(self) -> float: + r""" + A float that will have a valid numeric value when computing the + log-loss of the corresponding distribution. By default 0.0. + This value will be used when padding data series. + """ + return 0.0 + + def domain_map(self, *args: torch.Tensor): + r""" + Converts arguments to the right shape and domain. The domain depends + on the type of distribution, while the correct shape is obtained by + reshaping the trailing axis in such a way that the returned tensors + define a distribution of the right event_shape. 
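A hedged sketch of what a concrete `domain_map` does for the Student-T head defined right after this: the raw projections coming out of the network are unconstrained, so they are mapped into the valid parameter domain before the distribution is built:

```python
import torch
import torch.nn.functional as F
from torch.distributions import StudentT

raw_df, raw_loc, raw_scale = torch.randn(4, 1), torch.randn(4, 1), torch.randn(4, 1)

df = 2.0 + F.softplus(raw_df)   # df > 2 keeps the variance finite
scale = F.softplus(raw_scale)   # strictly positive scale
distr = StudentT(df.squeeze(-1), raw_loc.squeeze(-1), scale.squeeze(-1))
print(distr.log_prob(torch.zeros(4)))
```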
+ """ + raise NotImplementedError() + + +class StudentTOutput(DistributionOutput): + args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1} + distr_cls: type = StudentT + + @classmethod + def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor): + scale = F.softplus(scale) + df = 2.0 + F.softplus(df) + return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) + + @property + def event_shape(self) -> Tuple: + return () + + class FeatureEmbedder(nn.Module): def __init__( self, @@ -158,7 +312,7 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple return data, scale -def weighted_average(x: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: +def _weighted_average(x: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: """ Computes the weighted average of a given tensor across a given dim, masking values associated with weight zero, @@ -1520,7 +1674,7 @@ def forward( else: loss_weights = future_observed_values.min(dim=-1, keepdim=False) - prediction_loss = weighted_average(loss, weights=loss_weights) + prediction_loss = _weighted_average(loss, weights=loss_weights) if not return_dict: outputs = (params + outputs[1:]) if params is not None else outputs[1:] From 21dc3cdd9e6a5a63f50723a109f6a897401e8215 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 5 Sep 2022 12:44:12 +0200 Subject: [PATCH 073/164] fix class names --- .../modeling_time_series_transformer.py | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 7b700428d166b..ad5fe9f14527a 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -14,10 +14,9 @@ # limitations under the License. """ PyTorch TimeSeriesTransformer model. 
""" -import copy import random from dataclasses import dataclass -from typing import List, Optional, Tuple, Callable, Dict, Type +from typing import List, Optional, Tuple, Callable, Dict import torch from torch import nn @@ -80,7 +79,7 @@ def stddev(self): return self.variance.sqrt() -class PtArgProj(nn.Module): +class ParameterProjection(nn.Module): def __init__( self, in_features: int, @@ -111,18 +110,9 @@ def forward(self, x, *args): class Output: in_features: int args_dim: Dict[str, int] - _dtype: Type = np.float32 - @property - def dtype(self): - return self._dtype - - @dtype.setter - def dtype(self, dtype: Type): - self._dtype = dtype - - def get_args_proj(self, in_features: int) -> nn.Module: - return PtArgProj( + def get_param_proj(self, in_features: int) -> nn.Module: + return ParameterProjection( in_features=in_features, args_dim=self.args_dim, domain_map=LambdaLayer(self.domain_map), @@ -1606,7 +1596,7 @@ def __init__(self, config: TimeSeriesTransformerConfig): self.model = TimeSeriesTransformerModel(config) if config.distribution_output == "student_t": self.distribution_output = StudentTOutput() - self.param_proj = self.distribution_output.get_args_proj(self.model.config.d_model) + self.param_proj = self.distribution_output.get_param_proj(self.model.config.d_model) self.target_shape = self.distribution_output.event_shape if config.loss == "nll": From 9763a3e054353bf52462a75e73f7a7f38b4abeef Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 5 Sep 2022 13:36:46 +0200 Subject: [PATCH 074/164] avoid _variable names --- .../modeling_time_series_transformer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index ad5fe9f14527a..2d7842e02d472 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -101,10 +101,10 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]: class LambdaLayer(nn.Module): def __init__(self, function): super().__init__() - self._func = function + self.function = function def forward(self, x, *args): - return self._func(x, *args) + return self.function(x, *args) class Output: @@ -201,21 +201,21 @@ def __init__( ) -> None: super().__init__() - self._num_features = len(cardinalities) - self._embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)]) + self.num_features = len(cardinalities) + self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)]) def forward(self, features: torch.Tensor) -> torch.Tensor: if self._num_features > 1: # we slice the last dimension, giving an array of length # self._num_features with shape (N,T) or (N) - cat_feature_slices = torch.chunk(features, self._num_features, dim=-1) + cat_feature_slices = torch.chunk(features, self.num_features, dim=-1) else: cat_feature_slices = [features] return torch.cat( [ embed(cat_feature_slice.squeeze(-1)) - for embed, cat_feature_slice in zip(self._embedders, cat_feature_slices) + for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices) ], dim=-1, ) From 3dab2e1150dd66f99156da4f1dbc05d031a6a914 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 6 Sep 2022 09:01:11 +0200 Subject: [PATCH 075/164] remove gluonts dependency --- setup.cfg | 1 - setup.py | 4 ---- 
src/transformers/dependency_versions_table.py | 1 - 3 files changed, 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index cf559369bfd09..2d605ccceca78 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,7 +15,6 @@ known_third_party = fire fugashi git - gluonts h5py matplotlib nltk diff --git a/setup.py b/setup.py index ceacca5af77c9..240fc60f6df72 100644 --- a/setup.py +++ b/setup.py @@ -114,7 +114,6 @@ "ftfy", "fugashi>=1.0", "GitPython<3.1.19", - "gluonts>=0.10.0", "hf-doc-builder>=0.3.0", "huggingface-hub>=0.1.0,<1.0", "importlib_metadata", @@ -278,7 +277,6 @@ def run(self): extras["vision"] = deps_list("Pillow") extras["timm"] = deps_list("timm") extras["codecarbon"] = deps_list("codecarbon") -extras["gluonts"] = deps_list("gluonts") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") @@ -322,7 +320,6 @@ def run(self): + extras["timm"] + extras["codecarbon"] + extras["accelerate"] - + extras["gluonts"] ) # Might need to add doc-builder and some specific deps in the future @@ -347,7 +344,6 @@ def run(self): + extras["sklearn"] + extras["modelcreation"] + extras["onnxruntime"] - + extras["gluonts"] ) extras["dev-tensorflow"] = ( extras["testing"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 7a9144bc7f860..d63b79ababb50 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -20,7 +20,6 @@ "ftfy": "ftfy", "fugashi": "fugashi>=1.0", "GitPython": "GitPython<3.1.19", - "gluonts": "gluonts>=0.10.0", "hf-doc-builder": "hf-doc-builder>=0.3.0", "huggingface-hub": "huggingface-hub>=0.1.0,<1.0", "importlib_metadata": "importlib_metadata", From 27f134930ca462626379d3c3bce4cbaaf4589ee3 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 6 Sep 2022 09:01:28 +0200 Subject: [PATCH 076/164] fix imports --- .../modeling_time_series_transformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 2d7842e02d472..12ad25bc4cc82 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -16,12 +16,12 @@ import random from dataclasses import dataclass -from typing import List, Optional, Tuple, Callable, Dict +from typing import Callable, Dict, List, Optional, Tuple import torch -from torch import nn -from torch.distributions import TransformedDistribution, AffineTransform, Distribution, StudentT import torch.nn.functional as F +from torch import nn +from torch.distributions import AffineTransform, Distribution, StudentT, TransformedDistribution from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput From 72f9a27d0f32dc11c96d3f65f786ddd654c34cba Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 6 Sep 2022 09:19:28 +0200 Subject: [PATCH 077/164] remove gluonts from configuration --- .../configuration_time_series_transformer.py | 52 +++++++++++-------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index f26961027da01..a0a4f360d483f 100644 --- 
a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -15,8 +15,6 @@ """ TimeSeriesTransformer model configuration """ from typing import List, Optional -from gluonts.time_feature import get_lags_for_frequency, time_features_from_frequency_str - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -53,11 +51,10 @@ class TimeSeriesTransformerConfig(PretrainedConfig): The size of the target variable which by default is 1 for univariate targets. scaling (`bool`, *optional* defaults to `True`): Whether to scale the input targets. - freq (`str`, *optional*): - The frequency of the input time series. If `None`, the `lags_seq` and `num_time_features` are set at - the finest temporal resolution of 1 second. - lags_seq (`list` of `int`, *optional*): - The lags of the input time series. If `None`, the `freq` is used to determine the lags. + lags_seq (`list` of `int`): + The lags of the input time series as covariates often dictated by the frequency. Default is [1, 2, 3, 4, 5, 6, 7]. + num_time_features (`int`, *optional* defaults to 0): + The number of time features in the input time series. num_feat_dynamic_real (`int`, *optional* defaults to `0`): The number of dynamic real valued features. num_feat_static_cat (`int`, *optional* defaults to `0`): @@ -72,17 +69,33 @@ class TimeSeriesTransformerConfig(PretrainedConfig): Number of encoder layers. decoder_layers (`int`, *optional*, defaults to 2): Number of decoder layers. - num_attention_heads (`int`, *optional*, defaults to 2): - Number of attention heads for each attention layer in the Transformer encoder and decoder. - ffn_dim (`int`, *optional*, defaults to 32): - Dimension of the "intermediate" (often named feed-forward) layer in encoder and decoder. + encoder_attention_heads (`int`, *optional*, defaults to 2): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 2): + Number of attention heads for each attention layer in the Transformer decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 32): + Dimension of the "intermediate" (often named feed-forward) layer in encoder. + decoder_ffn_dim (`int`, *optional*, defaults to 32): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and `"relu"` are supported. dropout (`float`, *optional*, defaults to 0.1): The dropout probability for all fully connected layers in the encoder, and decoder. + encoder_layerdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention and fully connected layers for each encoder layer. + decoder_layerdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention and fully connected layers for each decoder layer. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability used between the two layers of the feed-forward networks. num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples to generate in parallel for each time step of inference. 
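What `num_parallel_samples` amounts to at inference time, sketched with a plain distribution rather than the model itself (an assumption for illustration): draw that many samples from the predictive distribution and summarise them, for example with a median and quantiles:

```python
import torch
from torch.distributions import StudentT

num_parallel_samples = 100
predictive = StudentT(df=torch.tensor(4.0), loc=torch.tensor(10.0), scale=torch.tensor(2.0))
samples = predictive.sample((num_parallel_samples,))  # one draw per parallel sample
print(samples.median(), torch.quantile(samples, torch.tensor([0.1, 0.9])))
```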
+ init_std (`float`, *optional*, defaults to 0.01): + The standard deviation of the truncated normal weight initialization distribution. + use_cache (`bool`, *optional*, defaults to `True`): + Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. Example: @@ -103,22 +116,22 @@ class TimeSeriesTransformerConfig(PretrainedConfig): def __init__( self, input_size: int = 1, - freq: Optional[str] = None, prediction_length: Optional[int] = None, context_length: Optional[int] = None, distribution_output: str = "student_t", loss: str = "nll", - lags_seq: Optional[List[int]] = None, + lags_seq: List[int] = [1, 2, 3, 4, 5, 6, 7], scaling: bool = True, num_feat_dynamic_real: int = 0, num_feat_static_cat: int = 0, num_feat_static_real: int = 0, + num_time_features: int = 0, cardinality: Optional[List[int]] = None, embedding_dimension: Optional[List[int]] = None, encoder_ffn_dim: int = 32, decoder_ffn_dim: int = 32, - decoder_attention_heads: int = 2, encoder_attention_heads: int = 2, + decoder_attention_heads: int = 2, encoder_layers: int = 2, decoder_layers: int = 2, is_encoder_decoder: bool = True, @@ -133,17 +146,14 @@ def __init__( use_cache=True, **kwargs ): - # time series specific parameters + # time series specific configuration self.prediction_length = prediction_length self.context_length = context_length or prediction_length - self.freq = freq or "1S" self.distribution_output = distribution_output self.loss = loss self.input_size = input_size - self.num_time_features = ( - len(time_features_from_frequency_str(freq_str=self.freq)) + 1 - ) # +1 for the Age feature - self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=self.freq) + self.num_time_features = num_time_features + self.lags_seq = lags_seq self.scaling = scaling self.num_feat_dynamic_real = num_feat_dynamic_real self.num_feat_static_real = num_feat_static_real @@ -152,7 +162,7 @@ def __init__( self.embedding_dimension = embedding_dimension or [min(50, (cat + 1) // 2) for cat in self.cardinality] self.num_parallel_samples = num_parallel_samples - # Transformer architecture parameters + # Transformer architecture configuration self.encoder_attention_heads = encoder_attention_heads self.decoder_attention_heads = decoder_attention_heads self.encoder_ffn_dim = encoder_ffn_dim From 949b82a57a405dec7e91dd4dc08336804b233dfc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 6 Sep 2022 09:25:54 +0200 Subject: [PATCH 078/164] fix docs --- .../configuration_time_series_transformer.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index a0a4f360d483f..76783b31a9f00 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -46,13 +46,13 @@ class TimeSeriesTransformerConfig(PretrainedConfig): distribution_output (`string`, *optional* defaults to `student_t`): The distribution emission head for the model. loss (`string`, *optional* defaults to `nll`): - The loss function for the model with corresponding to the `distribution_output` head. + The loss function for the model corresponding to the `distribution_output` head. For parametric distributions it is negative log likelihood. 
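In concrete terms, `loss="nll"` means minimising the negative log-likelihood of the observed targets under the emitted distribution, weighted by the observed-values indicator. A sketch under the assumption of a Student-T head (not the library's training loop):

```python
import torch
from torch.distributions import StudentT

target = torch.tensor([0.5, -1.2, 3.0])
observed = torch.tensor([1.0, 1.0, 0.0])  # the last step is missing

distr = StudentT(df=torch.full((3,), 4.0), loc=torch.zeros(3), scale=torch.ones(3))
nll = -distr.log_prob(target)                                  # per-step loss
loss = (nll * observed).sum() / observed.sum().clamp(min=1.0)  # weighted average
print(loss)
```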
input_size (`int`, *optional* defaults to 1): The size of the target variable which by default is 1 for univariate targets. scaling (`bool`, *optional* defaults to `True`): Whether to scale the input targets. - lags_seq (`list` of `int`): - The lags of the input time series as covariates often dictated by the frequency. Default is [1, 2, 3, 4, 5, 6, 7]. + lags_seq (`list` of `int` *optional* defaults to `[1, 2, 3, 4, 5, 6, 7]`): + The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4, 5, 6, 7]`. num_time_features (`int`, *optional* defaults to 0): The number of time features in the input time series. num_feat_dynamic_real (`int`, *optional* defaults to `0`): @@ -65,34 +65,34 @@ class TimeSeriesTransformerConfig(PretrainedConfig): The cardinality of the categorical features. Cannot be `None` if `num_feat_static_cat` is `> 0`. embedding_dimension (`list` of `int`, *optional*): The dimension of the embedding for the categorical features. Cannot be `None` if `num_feat_static_cat` is `> 0`. - encoder_layers (`int`, *optional*, defaults to 2): + encoder_layers (`int`, *optional*, defaults to `2`): Number of encoder layers. - decoder_layers (`int`, *optional*, defaults to 2): + decoder_layers (`int`, *optional*, defaults to `2`): Number of decoder layers. - encoder_attention_heads (`int`, *optional*, defaults to 2): + encoder_attention_heads (`int`, *optional*, defaults to `2`): Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (`int`, *optional*, defaults to 2): + decoder_attention_heads (`int`, *optional*, defaults to `2`): Number of attention heads for each attention layer in the Transformer decoder. - encoder_ffn_dim (`int`, *optional*, defaults to 32): + encoder_ffn_dim (`int`, *optional*, defaults to `32`): Dimension of the "intermediate" (often named feed-forward) layer in encoder. - decoder_ffn_dim (`int`, *optional*, defaults to 32): + decoder_ffn_dim (`int`, *optional*, defaults to `32`): Dimension of the "intermediate" (often named feed-forward) layer in decoder. activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and `"relu"` are supported. - dropout (`float`, *optional*, defaults to 0.1): + dropout (`float`, *optional*, defaults to `0.1`): The dropout probability for all fully connected layers in the encoder, and decoder. - encoder_layerdrop (`float`, *optional*, defaults to 0.1): + encoder_layerdrop (`float`, *optional*, defaults to `0.1`): The dropout probability for the attention and fully connected layers for each encoder layer. - decoder_layerdrop (`float`, *optional*, defaults to 0.1): + decoder_layerdrop (`float`, *optional*, defaults to `0.1`): The dropout probability for the attention and fully connected layers for each decoder layer. - attention_dropout (`float`, *optional*, defaults to 0.1): + attention_dropout (`float`, *optional*, defaults to `0.1`): The dropout probability for the attention probabilities. - activation_dropout (`float`, *optional*, defaults to 0.1): + activation_dropout (`float`, *optional*, defaults to `0.1`): The dropout probability used between the two layers of the feed-forward networks. - num_parallel_samples (`int`, *optional*, defaults to 100): + num_parallel_samples (`int`, *optional*, defaults to `100`): The number of samples to generate in parallel for each time step of inference. 
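The `lags_seq` covariates documented in the hunk above can be pictured with a small, assumed helper (this is not the model's exact implementation): for each requested lag, a shifted copy of the target history is stacked as an extra input feature, which is why the history has to extend `max(lags_seq)` steps beyond the context window:

```python
import torch

lags_seq = [1, 2, 3]
context_length = 4
past_target = torch.arange(1.0, 1.0 + context_length + max(lags_seq))  # 7 past values

lagged = torch.stack(
    [past_target[max(lags_seq) - lag : max(lags_seq) - lag + context_length] for lag in lags_seq],
    dim=-1,
)
print(lagged.shape)  # torch.Size([4, 3]): (context_length, len(lags_seq))
```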
- init_std (`float`, *optional*, defaults to 0.01): + init_std (`float`, *optional*, defaults to `0.02`): The standard deviation of the truncated normal weight initialization distribution. use_cache (`bool`, *optional*, defaults to `True`): Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. From 2eaf4016dcaa50b6a826585e19fcf158882e53cb Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 8 Sep 2022 12:15:33 +0200 Subject: [PATCH 079/164] fixed typo --- .../modeling_time_series_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 12ad25bc4cc82..de59e513239eb 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -205,9 +205,9 @@ def __init__( self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)]) def forward(self, features: torch.Tensor) -> torch.Tensor: - if self._num_features > 1: + if self.num_features > 1: # we slice the last dimension, giving an array of length - # self._num_features with shape (N,T) or (N) + # self.num_features with shape (N,T) or (N) cat_feature_slices = torch.chunk(features, self.num_features, dim=-1) else: cat_feature_slices = [features] From 924f1bd6fe194b5c38680dc2780b4b740e54417f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 8 Sep 2022 13:30:42 +0200 Subject: [PATCH 080/164] move utils to examples --- .../pytorch/time-series-prediction/utils_ts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) rename src/transformers/models/time_series_transformer/time_series_transformations.py => examples/pytorch/time-series-prediction/utils_ts.py (99%) diff --git a/src/transformers/models/time_series_transformer/time_series_transformations.py b/examples/pytorch/time-series-prediction/utils_ts.py similarity index 99% rename from src/transformers/models/time_series_transformer/time_series_transformations.py rename to examples/pytorch/time-series-prediction/utils_ts.py index 1c851c9928748..64a8b879b6612 100644 --- a/src/transformers/models/time_series_transformer/time_series_transformations.py +++ b/examples/pytorch/time-series-prediction/utils_ts.py @@ -18,6 +18,8 @@ from typing import Iterable, List, Optional import pandas as pd +from torch.utils.data import DataLoader + from gluonts.dataset.field_names import FieldName from gluonts.itertools import Cyclic, IterableSlice, PseudoShuffled from gluonts.time_feature import time_features_from_frequency_str @@ -39,7 +41,6 @@ VstackFeatures, ) from gluonts.transform.sampler import InstanceSampler -from torch.utils.data import DataLoader @lru_cache(10_000) From 4b955b403070c30830b8246dd77e7420ff16edd0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 8 Sep 2022 13:31:26 +0200 Subject: [PATCH 081/164] add example requirements --- examples/pytorch/time-series-prediction/requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 examples/pytorch/time-series-prediction/requirements.txt diff --git a/examples/pytorch/time-series-prediction/requirements.txt b/examples/pytorch/time-series-prediction/requirements.txt new file mode 100644 index 0000000000000..eeac4319dcc22 --- /dev/null +++ b/examples/pytorch/time-series-prediction/requirements.txt @@ -0,0 +1,3 @@ +accelerate 
+datasets +gluonts[torch] From 5dfa7c7284fbfba4e25f4cc46735b4d7fee9480f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 8 Sep 2022 14:30:29 +0200 Subject: [PATCH 082/164] config has no freq --- .../pytorch/time-series-prediction/utils_ts.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/pytorch/time-series-prediction/utils_ts.py b/examples/pytorch/time-series-prediction/utils_ts.py index 64a8b879b6612..382dca41e84d4 100644 --- a/examples/pytorch/time-series-prediction/utils_ts.py +++ b/examples/pytorch/time-series-prediction/utils_ts.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Transformations for Time Series Transformers. """ +""" Transformations Utilities for Time Series Transformers. """ from functools import lru_cache from typing import Iterable, List, Optional @@ -53,7 +53,7 @@ def transform_start_field(batch, freq): return batch -def create_transformation(config) -> Transformation: +def create_transformation(freq, config) -> Transformation: remove_field_names = [] if config.num_feat_static_real == 0: remove_field_names.append(FieldName.FEAT_STATIC_REAL) @@ -90,7 +90,7 @@ def create_transformation(config) -> Transformation: start_field=FieldName.START, target_field=FieldName.TARGET, output_field=FieldName.FEAT_TIME, - time_features=time_features_from_frequency_str(config.freq), + time_features=time_features_from_frequency_str(freq), pred_length=config.prediction_length, ), AddAgeFeature( @@ -138,6 +138,7 @@ def create_instance_splitter( def create_training_data_loader( + freq, config, data, batch_size: int, @@ -159,7 +160,7 @@ def create_training_data_loader( "future_" + FieldName.OBSERVED_VALUES, ] - transformation = create_transformation(config) + transformation = create_transformation(freq, config) transformed_data = transformation.apply(data, is_train=True) instance_splitter = create_instance_splitter(config, "train") + SelectFields(TRAINING_INPUT_NAMES) @@ -187,6 +188,7 @@ def create_training_data_loader( def create_validation_data_loader( + freq, config, data, batch_size, @@ -205,7 +207,7 @@ def create_validation_data_loader( "future_" + FieldName.TARGET, "future_" + FieldName.OBSERVED_VALUES, ] - transformation = create_transformation(config) + transformation = create_transformation(freq, config) transformed_data = transformation.apply(data, is_train=True) instance_splitter = create_instance_splitter(config, "validation") + SelectFields(TRAINING_INPUT_NAMES) @@ -219,6 +221,7 @@ def create_validation_data_loader( def create_test_data_loader( + freq, config, data, batch_size, @@ -232,7 +235,7 @@ def create_test_data_loader( "past_" + FieldName.OBSERVED_VALUES, "future_" + FieldName.FEAT_TIME, ] - transformation = create_transformation(config) + transformation = create_transformation(freq, config) transformed_data = transformation.apply(data, is_train=False) instance_splitter = create_instance_splitter(config, "test") + SelectFields(PREDICTION_INPUT_NAMES) test_instances = instance_splitter.apply(transformed_data, is_tran=False) From 7e02a57b3a72e17e80ee37bedcbfab1860ed37ce Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 12 Sep 2022 14:40:34 +0200 Subject: [PATCH 083/164] initial run_ts_no_trainer --- .../run_ts_no_trainer.py | 194 ++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 examples/pytorch/time-series-prediction/run_ts_no_trainer.py diff 
--git a/examples/pytorch/time-series-prediction/run_ts_no_trainer.py b/examples/pytorch/time-series-prediction/run_ts_no_trainer.py new file mode 100644 index 0000000000000..fc9fad5b83a8b --- /dev/null +++ b/examples/pytorch/time-series-prediction/run_ts_no_trainer.py @@ -0,0 +1,194 @@ +import argparse +import json +import logging +import math +import os +import random +from itertools import chain +from pathlib import Path + +import datasets +import torch +from datasets import load_dataset +from torch.utils.data import DataLoader +from tqdm.auto import tqdm + +import transformers +from accelerate import Accelerator, DistributedType +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from huggingface_hub import Repository +from transformers import SchedulerType +from transformers.utils import get_full_repo_name, send_example_telemetry +from transformers.utils.versions import require_version + + +logger = get_logger(__name__) +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/time-series-prediction/requirements.txt") + +# Parsing input arguments +def parse_args(): + parser = argparse.ArgumentParser( + description="Finetune a transformers model on a probabilistic time series forecasting task" + ) + parser.add_argument( + "--dataset_name", + type=str, + default="monash_tsf", + help="The name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default="tourism_monthly", + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=False, + ) + parser.add_argument( + "--config_name", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--prediction_length", + type=int, + default=None, + help=("The prediction horizon of the time series forecasting task."), + ) + parser.add_argument( + "--freq", + type=str, + default=None, + help="The frequency of the time series.", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." 
+ ) + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument( + "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." + ) + parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="all", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + "Only applicable when `--with_tracking` is passed." + ), + ) + args = parser.parse_args() + + +def main(): + args = parse_args() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_ts_no_trainer", args) + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers + # in the environment + accelerator = ( + Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. 
+ if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + repo = Repository(args.output_dir, clone_from=repo_name) + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + accelerator.wait_for_everyone() + + # Get the datasets + raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + + # config + config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir) + + # model + model = TimeSeriesTransformerForPrediction(config) From 1b166abd7e6c67ff7bf638adf1dfb06e2a9f1bd3 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 22 Sep 2022 16:38:20 +0200 Subject: [PATCH 084/164] remove from ignore --- utils/check_repo.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/utils/check_repo.py b/utils/check_repo.py index c07312e0678e1..9905bb00544b7 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -46,9 +46,6 @@ # Being in this list is an exception and should **not** be the rule. IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ # models to ignore for not tested - "TimeSeriesTransformerEncoder", # Building part of bigger (tested) model. - "TimeSeriesTransformerDecoder", # Building part of bigger (tested) model. - "TimeSeriesTransformerDecoderWrapper", # Building part of bigger (tested) model. "OPTDecoder", # Building part of bigger (tested) model. "DecisionTransformerGPT2Model", # Building part of bigger (tested) model. "SegformerDecodeHead", # Building part of bigger (tested) model. @@ -127,9 +124,6 @@ # should **not** be the rule. 
IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping - "TimeSeriesTransformerEncoder", - "TimeSeriesTransformerDecoder", - "TimeSeriesTransformerDecoderWrapper", "DPTForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", From 727982a3358e0639cd22cb34a5ab59019b401780 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 22 Sep 2022 16:55:39 +0200 Subject: [PATCH 085/164] fix output_attentions and removed unsued getters/setters --- .../modeling_time_series_transformer.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index de59e513239eb..07cc7660ac5a2 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -51,7 +51,6 @@ class AffineTransformed(TransformedDistribution): def __init__(self, base_distribution: Distribution, loc=None, scale=None): - self.scale = 1.0 if scale is None else scale self.loc = 0.0 if loc is None else loc @@ -1494,14 +1493,6 @@ def enc_dec_outputs(self, transformer_inputs): ) return encoder_outputs, decoder_outputs - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, value): - self.shared = value - self.encoder.embed_tokens = self.shared - self.decoder.embed_tokens = self.shared - def get_encoder(self): return self.encoder @@ -1526,7 +1517,7 @@ def forward( encoder_outputs: Optional[List[torch.FloatTensor]] = None, output_hidden_states: bool = False, use_cache: bool = False, - output_attentions: bool = False, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ): transformer_inputs, scale, static_feat = self.create_network_inputs( From 84c77ad7b52b183654876d6bede40b1444526a97 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 22 Sep 2022 16:55:54 +0200 Subject: [PATCH 086/164] removed unsed tests --- .../test_modeling_time_series_transformer.py | 437 +++--------------- 1 file changed, 69 insertions(+), 368 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index feb7d9801b071..348c39c36d609 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -20,12 +20,11 @@ import unittest from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device from transformers.utils import cached_property -from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from ...test_configuration_common import ConfigTester -from ...generation.test_generation_utils import GenerationTesterMixin -from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor if is_torch_available(): @@ -42,35 +41,18 @@ ) -def prepare_time_series_transformer_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - 
decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - } - - @require_torch class TimeSeriesTransformerModelTester: def __init__( self, parent, batch_size=13, - seq_length=7, + prediction_length=7, + context_length=14, + cardinality=19, + embedding_dimension=5, + num_time_features=4, is_training=True, - use_labels=False, - vocab_size=99, hidden_size=16, num_hidden_layers=2, num_attention_heads=4, @@ -78,17 +60,17 @@ def __init__( hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, + lags_seq=[1, 2, 3, 4, 5], ): self.parent = parent self.batch_size = batch_size - self.seq_length = seq_length + self.prediction_length = prediction_length + self.context_length = context_length + self.cardinality = cardinality + self.num_time_features = num_time_features + self.lags_seq = lags_seq + self.embedding_dimension = embedding_dimension self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads @@ -96,23 +78,26 @@ def __init__( self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id + + self.encoder_seq_length = context_length + self.key_length = context_length def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token + _past_length = self.context_length + max(self.lags_seq) - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + feat_static_cat = ids_tensor([self.batch_size, 1], self.cardinality) + feat_static_real = floats_tensor([self.batch_size, 1]) + + past_time_feat = floats_tensor([self.batch_size, _past_length, self.num_time_features]) + past_target = floats_tensor([self.batch_size, _past_length]) + past_observed_values = floats_tensor([self.batch_size, _past_length]) + + # decoder inputs + future_time_feat = floats_tensor([self.batch_size, self.prediction_length, self.num_time_features]) + future_target = floats_tensor([self.batch_size, self.prediction_length]) + future_observed_values = floats_tensor([self.batch_size, self.prediction_length]) config = TimeSeriesTransformerConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers, encoder_attention_heads=self.num_attention_heads, @@ -121,12 +106,24 @@ def prepare_config_and_inputs(self): decoder_ffn_dim=self.intermediate_size, dropout=self.hidden_dropout_prob, attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, + prediction_length=self.prediction_length, + context_length=self.context_length, + lags_seq=self.lags_seq, + num_time_features=self.num_time_features, + num_feat_static_cat=1, + 
cardinality=[self.cardinality], + embedding_dimension=[self.embedding_dimension], ) - inputs_dict = prepare_time_series_transformer_inputs_dict(config, input_ids, decoder_input_ids) + + inputs_dict = { + "feat_static_cat": feat_static_cat, + "feat_static_real": feat_static_real, + "past_time_feat": past_time_feat, + "past_target": past_target, + "future_time_feat": future_time_feat, + "past_observed_values": past_observed_values, + "future_target": future_target, + } return config, inputs_dict def prepare_config_and_inputs_for_common(self): @@ -178,9 +175,11 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict): encoder.save_pretrained(tmpdirname) encoder = TimeSeriesTransformerEncoder.from_pretrained(tmpdirname).to(torch_device) - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] + transformer_inputs, _, _ = model.create_network_inputs(**inputs_dict) + enc_input = transformer_inputs[:, : config.context_length, ...] + dec_input = transformer_inputs[:, config.context_length :, ...] + + encoder_last_hidden_state_2 = encoder(input_ids=enc_input)[0] self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) @@ -190,33 +189,29 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict): decoder = TimeSeriesTransformerDecoder.from_pretrained(tmpdirname).to(torch_device) last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], + input_ids=dec_input, encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], )[0] self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) @require_torch -class TimeSeriesTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( TimeSeriesTransformerModel, TimeSeriesTransformerForPrediction, - TimeSeriesTransformerForConditionalGeneration, - TimeSeriesTransformerForSequenceClassification, - TimeSeriesTransformerForQuestionAnswering, ) if is_torch_available() else () ) - all_generative_model_classes = (TimeSeriesTransformerForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (TimeSeriesTransformerForPrediction,) if is_torch_available() else () is_encoder_decoder = True test_pruning = False test_head_masking = False test_missing_keys = False + test_torchscript = False def setUp(self): self.model_tester = TimeSeriesTransformerModelTester(self) @@ -235,57 +230,19 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_encoder_decoder_model_standalone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - # TimeSeriesTransformerForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in ( - TimeSeriesTransformerModel, - 
TimeSeriesTransformerForConditionalGeneration, - TimeSeriesTransformerForQuestionAnswering, - ): - model = model_class(config) - model.to(torch_device) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with torch.no_grad(): - model(**inputs)[0] - - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1).to(torch_device) - model = TimeSeriesTransformerForConditionalGeneration(config).eval().to(torch_device) - if torch_device == "cuda": - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + # def test_generate_fp16(self): + # config, input_dict = self.model_tester.prepare_config_and_inputs() + # input_ids = input_dict["input_ids"] + # attention_mask = input_ids.ne(1).to(torch_device) + # model = TimeSeriesTransformerForPrediction(config).eval().to(torch_device) + # if torch_device == "cuda": + # model.half() + # model.generate(input_ids, attention_mask=attention_mask) + # model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) def assert_tensors_close(a, b, atol=1e-12, prefix=""): @@ -315,13 +272,11 @@ def _long_tensor(tok_lst): @require_torch -@require_sentencepiece -@require_tokenizers @slow class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): - @cached_property - def default_tokenizer(self): - return TimeSeriesTransformerTokenizer.from_pretrained("huggingface/tst-ett") + # @cached_property + # def default_tokenizer(self): + # return TimeSeriesTransformerTokenizer.from_pretrained("huggingface/tst-ett") def test_inference_no_head(self): model = TimeSeriesTransformerModel.from_pretrained("huggingface/tst-ett").to(torch_device) @@ -339,7 +294,7 @@ def test_inference_no_head(self): self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) def test_inference_head(self): - model = TimeSeriesTransformerForConditionalGeneration.from_pretrained("huggingface/tst-ett").to(torch_device) + model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/tst-ett").to(torch_device) # change to intended input input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) @@ -356,258 +311,4 @@ def test_inference_head(self): self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) def test_seq_to_seq_generation(self): - hf = TimeSeriesTransformerForConditionalGeneration.from_pretrained("huggingface/tst-ett").to(torch_device) - tok = TimeSeriesTransformerTokenizer.from_pretrained("huggingface/tst-ett") - - batch_input = [ - # string 1, - # string 2, - # string 3, - # string 4, - ] - - # The below article tests that we don't add any hypotheses outside of the top n_beams - dct = tok.batch_encode_plus( - batch_input, - max_length=512, - padding="max_length", - truncation_strategy="only_first", - 
truncation=True, - return_tensors="pt", - ) - - hypotheses_batch = hf.generate( - input_ids=dct["input_ids"].to(torch_device), - attention_mask=dct["attention_mask"].to(torch_device), - num_beams=2, - ) - - EXPECTED = [ - # here expected 1, - # here expected 2, - # here expected 3, - # here expected 4, - ] - - generated = tok.batch_decode( - hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True - ) - assert generated == EXPECTED - - -class TimeSeriesTransformerStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=4, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = TimeSeriesTransformerConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = TimeSeriesTransformerDecoder(config=config).to(torch_device).eval() - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == 
len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() - - # test that outputs are equal for slice - assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = TimeSeriesTransformerDecoder(config=config).to(torch_device).eval() - - # create attention mask - attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - attn_mask = torch.cat( - [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() - - # test that outputs are equal for slice - assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_torch -class TimeSeriesTransformerStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (TimeSeriesTransformerDecoder, TimeSeriesTransformerForCausalLM) if is_torch_available() else () - ) - all_generative_model_classes = (TimeSeriesTransformerForCausalLM,) if is_torch_available() else () - test_pruning = False - is_encoder_decoder = False - - def setUp( - self, - ): - self.model_tester = 
TimeSeriesTransformerStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=TimeSeriesTransformerConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - return + raise NotImplementedError("Generation not implemented yet") From 06679dd03a9ff18b92bee270a42f4f19dd60d901 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 22 Sep 2022 19:02:39 +0200 Subject: [PATCH 087/164] add dec seq len --- .../time_series_transformer/modeling_time_series_transformer.py | 1 - .../test_modeling_time_series_transformer.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 07cc7660ac5a2..2c4cf9aedf2f5 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1583,7 +1583,6 @@ def forward( class TimeSeriesTransformerForPrediction(TimeSeriesTransformerModel): def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) - self.config = config self.model = TimeSeriesTransformerModel(config) if config.distribution_output == "student_t": self.distribution_output = StudentTOutput() diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 348c39c36d609..e9ee953f9c447 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -81,6 +81,7 @@ def __init__( self.encoder_seq_length = context_length self.key_length = context_length + self.decoder_seq_length = prediction_length def prepare_config_and_inputs(self): _past_length = self.context_length + max(self.lags_seq) From 86562755fc0b5d0e159f6b3410660e0082dceafd Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 22 Sep 2022 22:10:12 +0200 Subject: [PATCH 088/164] add test_attention_outputs --- .../test_modeling_time_series_transformer.py | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index e9ee953f9c447..4064bf1df24b6 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -245,6 +245,115 @@ def test_encoder_decoder_model_standalone(self): # model.generate(input_ids, attention_mask=attention_mask) # model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = 
getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 6 + + if "last_hidden_state" in outputs: + correct_outlen += 1 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + 
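        # For the tester defined above, these shape assertions reduce to the following
        # (derived from the attributes it sets: encoder_seq_length == context_length,
        # decoder_seq_length == prediction_length, with the key lengths defaulting to the same values):
        #   encoder self-attention: (batch_size, num_attention_heads, context_length, context_length)
        #   decoder self-attention: (batch_size, num_attention_heads, prediction_length, prediction_length)
        #   cross-attention:        (batch_size, num_attention_heads, prediction_length, context_length)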
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + def assert_tensors_close(a, b, atol=1e-12, prefix=""): """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" From f4465b7dcee6b49cf5d6a069a158a932386f680f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 22 Sep 2022 22:23:25 +0200 Subject: [PATCH 089/164] set has_text_modality=False --- .../test_modeling_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 4064bf1df24b6..4701d3015bb82 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -216,7 +216,7 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = TimeSeriesTransformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=TimeSeriesTransformerConfig) + self.config_tester = ConfigTester(self, config_class=TimeSeriesTransformerConfig, has_text_modality=False) def test_config(self): self.config_tester.run_common_tests() From 93d02899b0f7592ec9d44bbc20c7796b332fd4ec Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 22 Sep 2022 22:24:56 +0200 Subject: [PATCH 090/164] add config attribute_map --- .../configuration_time_series_transformer.py | 16 ++++++++++++++++ .../modeling_time_series_transformer.py | 12 ------------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 76783b31a9f00..840bfa5d7ce97 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -112,6 +112,11 @@ class TimeSeriesTransformerConfig(PretrainedConfig): >>> configuration = model.config ```""" model_type = "time_series_transformer" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + "num_hidden_layers": "encoder_layers", + } def __init__( self, @@ -163,6 +168,7 @@ def __init__( self.num_parallel_samples = num_parallel_samples # Transformer architecture configuration + self.d_model = input_size * len(lags_seq) + self._number_of_features self.encoder_attention_heads = encoder_attention_heads self.decoder_attention_heads = decoder_attention_heads self.encoder_ffn_dim = encoder_ffn_dim @@ -185,3 +191,13 @@ def __init__( self.use_cache = use_cache super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def _number_of_features(self) -> int: + return ( + sum(self.embedding_dimension) + + self.num_feat_dynamic_real + + self.num_time_features + + max(1, self.num_feat_static_real) # there is at least one dummy static real feature + + 1 # the log(scale) + ) diff --git 
a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 2c4cf9aedf2f5..f08e0f79c2126 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1343,16 +1343,6 @@ def custom_forward(*inputs): TIME_SERIES_TRANSFORMER_START_DOCSTRING, ) class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel): - @property - def _number_of_features(self) -> int: - return ( - sum(self.config.embedding_dimension) - + self.config.num_feat_dynamic_real - + self.config.num_time_features - + max(1, self.config.num_feat_static_real) # there is at least one dummy static real feature - + 1 # the log(scale) - ) - def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) @@ -1366,8 +1356,6 @@ def __init__(self, config: TimeSeriesTransformerConfig): embedding_dims=config.embedding_dimension, ) - config.d_model = config.input_size * len(config.lags_seq) + self._number_of_features - # transformer enc-decoder and mask initializer self.encoder = TimeSeriesTransformerEncoder(config) self.decoder = TimeSeriesTransformerDecoder(config) From 19f188fc85158bf0ba30b0e8bc13bcd072816e84 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 23 Sep 2022 11:18:49 +0200 Subject: [PATCH 091/164] make style --- .../configuration_time_series_transformer.py | 29 +-- .../modeling_time_series_transformer.py | 200 ++++++++---------- 2 files changed, 102 insertions(+), 127 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 840bfa5d7ce97..703992e6fd3b7 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TimeSeriesTransformer model configuration """ +""" TimeSeriesTransformer model configuration""" from typing import List, Optional from ...configuration_utils import PretrainedConfig @@ -29,30 +29,32 @@ class TimeSeriesTransformerConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`~TimeSeriesTransformerModel`]. - It is used to instantiate a TimeSeriesTransformer model according to the specified arguments, defining the model - architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of - the TimeSeriesTransformer [huggingface/tst-ett](https://huggingface.co/huggingface/tst-ett) architecture. + This is the configuration class to store the configuration of a [`~TimeSeriesTransformerModel`]. It is used to + instantiate a TimeSeriesTransformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the + TimeSeriesTransformer [huggingface/tst-ett](https://huggingface.co/huggingface/tst-ett) architecture. - Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. 
- Read the documentation from [`PretrainedConfig`] for more information. + Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Args: prediction_length (`int`): The prediction horizon for the model. context_length (`int`, *optional*): - The context length for the encoder. If `None`, the context length will be the same as the + The context length for the encoder. If `None`, the context length will be the same as the `prediction_length`. distribution_output (`string`, *optional* defaults to `student_t`): The distribution emission head for the model. loss (`string`, *optional* defaults to `nll`): - The loss function for the model corresponding to the `distribution_output` head. For parametric distributions it is negative log likelihood. + The loss function for the model corresponding to the `distribution_output` head. For parametric + distributions it is negative log likelihood. input_size (`int`, *optional* defaults to 1): The size of the target variable which by default is 1 for univariate targets. scaling (`bool`, *optional* defaults to `True`): Whether to scale the input targets. lags_seq (`list` of `int` *optional* defaults to `[1, 2, 3, 4, 5, 6, 7]`): - The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4, 5, 6, 7]`. + The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4, + 5, 6, 7]`. num_time_features (`int`, *optional* defaults to 0): The number of time features in the input time series. num_feat_dynamic_real (`int`, *optional* defaults to `0`): @@ -64,7 +66,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig): cardinality (`list` of `int`, *optional*): The cardinality of the categorical features. Cannot be `None` if `num_feat_static_cat` is `> 0`. embedding_dimension (`list` of `int`, *optional*): - The dimension of the embedding for the categorical features. Cannot be `None` if `num_feat_static_cat` is `> 0`. + The dimension of the embedding for the categorical features. Cannot be `None` if `num_feat_static_cat` is + `> 0`. encoder_layers (`int`, *optional*, defaults to `2`): Number of encoder layers. decoder_layers (`int`, *optional*, defaults to `2`): @@ -78,8 +81,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig): decoder_ffn_dim (`int`, *optional*, defaults to `32`): Dimension of the "intermediate" (often named feed-forward) layer in decoder. activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and decoder. If string, - `"gelu"` and `"relu"` are supported. + The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and + `"relu"` are supported. dropout (`float`, *optional*, defaults to `0.1`): The dropout probability for all fully connected layers in the encoder, and decoder. 
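        A minimal instantiation sketch using only the parameters documented above — the numeric values are
        illustrative assumptions, not library defaults:

        ```python
        >>> from transformers import TimeSeriesTransformerConfig

        >>> config = TimeSeriesTransformerConfig(
        ...     prediction_length=24,
        ...     context_length=48,
        ...     num_time_features=2,
        ...     num_feat_static_cat=1,
        ...     cardinality=[320],
        ...     embedding_dimension=[16],
        ... )
        ```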
encoder_layerdrop (`float`, *optional*, defaults to `0.1`): diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index f08e0f79c2126..cf8b7678b5f62 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch TimeSeriesTransformer model. """ +""" PyTorch TimeSeriesTransformer model.""" import random from dataclasses import dataclass @@ -145,34 +145,31 @@ def distribution( @property def event_shape(self) -> Tuple: r""" - Shape of each individual event contemplated by the distributions - that this object constructs. + Shape of each individual event contemplated by the distributions that this object constructs. """ raise NotImplementedError() @property def event_dim(self) -> int: r""" - Number of event dimensions, i.e., length of the `event_shape` tuple, - of the distributions that this object constructs. + Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object + constructs. """ return len(self.event_shape) @property def value_in_support(self) -> float: r""" - A float that will have a valid numeric value when computing the - log-loss of the corresponding distribution. By default 0.0. - This value will be used when padding data series. + A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By + default 0.0. This value will be used when padding data series. """ return 0.0 def domain_map(self, *args: torch.Tensor): r""" - Converts arguments to the right shape and domain. The domain depends - on the type of distribution, while the correct shape is obtained by - reshaping the trailing axis in such a way that the returned tensors - define a distribution of the right event_shape. + Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the + correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a + distribution of the right event_shape. """ raise NotImplementedError() @@ -222,18 +219,13 @@ def forward(self, features: torch.Tensor) -> torch.Tensor: class MeanScaler(nn.Module): """ - Computes a scaling factor as the weighted average absolute value along - dimension ``dim``, and scales the data accordingly. - Parameters - ---------- - dim + Computes a scaling factor as the weighted average absolute value along dimension ``dim``, and scales the data + accordingly. Parameters ---------- dim dimension along which to compute the scale keepdim - controls whether to retain dimension ``dim`` (of length 1) in the - scale tensor, or suppress it. + controls whether to retain dimension ``dim`` (of length 1) in the scale tensor, or suppress it. minimum_scale - default scale that is used for elements that are constantly zero - along dimension ``dim``. + default scale that is used for elements that are constantly zero along dimension ``dim``. 
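    In rough pseudo-code, for `data` and observation `weights` of shape `(batch, time, 1)` the behaviour
    described above amounts to the following sketch (not the exact implementation):

        scale = (data.abs() * weights).sum(dim) / weights.sum(dim).clamp(min=1.0)
        scaled_data = data / scale.clamp(min=minimum_scale)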
""" def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10): @@ -277,15 +269,11 @@ def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tens class NOPScaler(nn.Module): """ - Assigns a scaling factor equal to 1 along dimension ``dim``, and therefore - applies no scaling to the input data. - Parameters - ---------- - dim + Assigns a scaling factor equal to 1 along dimension ``dim``, and therefore applies no scaling to the input data. + Parameters ---------- dim dimension along which to compute the scale keepdim - controls whether to retain dimension ``dim`` (of length 1) in the - scale tensor, or suppress it. + controls whether to retain dimension ``dim`` (of length 1) in the scale tensor, or suppress it. """ def __init__(self, dim: int, keepdim: bool = False): @@ -303,20 +291,14 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple def _weighted_average(x: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: """ - Computes the weighted average of a given tensor across a given dim, masking - values associated with weight zero, - meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. - Parameters - ---------- - x + Computes the weighted average of a given tensor across a given dim, masking values associated with weight zero, + meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. Parameters ---------- x Input tensor, of which the average must be computed. weights Weights tensor, of the same shape as `x`. dim The dim along which to average `x` - Returns - ------- - Tensor: + Returns ------- Tensor: The tensor with values averaged along the specified `dim`. """ if weights is not None: @@ -329,14 +311,9 @@ def _weighted_average(x: torch.Tensor, weights: Optional[torch.Tensor] = None, d class NegativeLogLikelihood: """ - Compute the negative log likelihood loss. - Parameters - ---------- - beta: float in range (0, 1) - beta parameter from the paper: "On the Pitfalls of Heteroscedastic - Uncertainty Estimation with Probabilistic Neural Networks" by - Seitzer et al. 2022 - https://openreview.net/forum?id=aPOpXlnV1T + Compute the negative log likelihood loss. Parameters ---------- beta: float in range (0, 1) + beta parameter from the paper: "On the Pitfalls of Heteroscedastic Uncertainty Estimation with Probabilistic + Neural Networks" by Seitzer et al. 2022 https://openreview.net/forum?id=aPOpXlnV1T """ beta: float = 0.0 @@ -716,7 +693,8 @@ def forward( hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)* attention_mask (`torch.FloatTensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. - encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape *(seq_len, batch, embed_dim)* encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size @@ -834,20 +812,19 @@ def _set_gradient_checkpointing(self, module, value=False): TIME_SERIES_TRANSFORMER_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic - methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) - subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to - general usage and behavior. + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. Parameters: config ([`~TimeSeriesTransformerConfig`]): - Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model - weights. + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ TIME_SERIES_TRANSFORMER_PREDICTION_EXAMPLE = r""" @@ -856,13 +833,13 @@ def _set_gradient_checkpointing(self, module, value=False): ```python >>> from transformers import TimeSeriesTransformerForPrediction - >>> model = TimeSeriesTransformerForConditionalGeneration.from_pretrained('huggingface/tst-ett') + >>> model = TimeSeriesTransformerForConditionalGeneration.from_pretrained("huggingface/tst-ett") >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." - >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt") >>> # Generate Summary - >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5) + >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5) >>> print(tokenizer.decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) ``` """ @@ -873,9 +850,8 @@ def _set_gradient_checkpointing(self, module, value=False): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. - Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for - details. + Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -889,12 +865,12 @@ def _set_gradient_checkpointing(self, module, value=False): Provide for translation and summarization training. By default, the model will create this tensor by shifting the `input_ids` to the right, following the paper. decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. 
Causal mask will - also be used by default. + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. - If you want to change padding behavior, you should read [`modeling_time_series_transformer._prepare_decoder_attention_mask`] and - modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. + If you want to change padding behavior, you should read + [`modeling_time_series_transformer._prepare_decoder_attention_mask`] and modify to your needs. See diagram + 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: @@ -914,33 +890,35 @@ def _set_gradient_checkpointing(self, module, value=False): - 0 indicates the head is **masked**. encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: - `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, - *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the - cross-attention of the decoder. + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors - of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of - shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` - (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` - instead of all ``decoder_input_ids``` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + ``decoder_input_ids``` of shape `(batch_size, sequence_length)`. 
inputs_embeds (`torch.FloatTensor` of + shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` + you can choose to directly pass an embedded representation. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup + matrix. decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded - representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` - have to be input (see `past_key_values`). This is useful if you want more control over how to convert + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` - takes the value of `inputs_embeds`. + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up - decoding (see `past_key_values`). + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -958,9 +936,8 @@ def _set_gradient_checkpointing(self, module, value=False): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. - Indices can be obtained using [`ProphetNetTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for - details. + Indices can be obtained using [`ProphetNetTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1031,9 +1008,9 @@ def forward( - 0 indicates the head is **masked**. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded - representation. This is useful if you want more control over how to convert `input_ids` indices - into associated vectors than the model's internal embedding lookup matrix. + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1115,7 +1092,8 @@ def custom_forward(*inputs): class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel): """ - Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a [`TimeSeriesTransformerDecoderLayer`] + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a + [`TimeSeriesTransformerDecoderLayer`] Args: config: TimeSeriesTransformerConfig @@ -1199,19 +1177,20 @@ def forward( - 0 indicates the head is **masked**. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 - tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional - tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the - cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential - decoding. - - If `past_key_values` are used, the user can optionally input only the last - `decoder_input_ids` (those that don't have their past key value states given to this model) of - shape `(batch_size, 1)` instead of all ``decoder_input_ids``` of shape `(batch_size, - sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices - into associated vectors than the model's internal embedding lookup matrix. + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all ``decoder_input_ids``` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` + of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing + `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more + control over how to convert `input_ids` indices into associated vectors than the model's internal + embedding lookup matrix. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1371,22 +1350,15 @@ def get_lagged_subsequences( self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0 ) -> torch.Tensor: """ - Returns lagged subsequences of a given sequence. - Parameters - ---------- - sequence : Tensor - the sequence from which lagged subsequences should be extracted. - Shape: (N, T, C). + Returns lagged subsequences of a given sequence. Parameters ---------- sequence : Tensor + the sequence from which lagged subsequences should be extracted. Shape: (N, T, C). subsequences_length : int length of the subsequences to be extracted. shift: int shift the lags by this amount back. 
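        A small worked example of the lag indexing implemented here (the numbers are chosen purely for
        illustration): with `lags_seq = [1, 2]`, `shift = 0` and `subsequences_length = 3`, the extracted lag
        indices are `[1, 2]`, so for every output position `j` in `0..2`:

            lagged[i, j, :, 0] = sequence[i, -1 - 3 + j, :]  # one step before the j-th of the last 3 positions
            lagged[i, j, :, 1] = sequence[i, -2 - 3 + j, :]  # two steps before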
- Returns - -------- - lagged : Tensor - a tensor of shape (N, S, C, I), where S = subsequences_length and - I = len(indices), containing lagged subsequences. Specifically, - lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :]. + Returns -------- lagged : Tensor + a tensor of shape (N, S, C, I), where S = subsequences_length and I = len(indices), containing lagged + subsequences. Specifically, lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :]. """ sequence_length = sequence.shape[1] indices = [lag - shift for lag in self.config.lags_seq] From 444ba8992a513a384b52fc91e466c887fdcd6d72 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 23 Sep 2022 11:33:05 +0200 Subject: [PATCH 092/164] make fix-copies --- .../modeling_time_series_transformer.py | 59 +++++++++++-------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index cf8b7678b5f62..977fad7f407f9 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -440,6 +440,7 @@ class SampleTSPredictionOutput(ModelOutput): sequences: torch.FloatTensor = None +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->TimeSeriesTransformer class TimeSeriesTransformerAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -456,10 +457,12 @@ def __init__( self.num_heads = num_heads self.dropout = dropout self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, ( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {num_heads})." - ) + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder @@ -485,7 +488,8 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -555,7 +559,7 @@ def forward( if output_attentions: # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to reshaped + # In order to do so, attn_weights have to be reshaped # twice and have to be reused in the following attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) @@ -574,13 +578,17 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) return attn_output, attn_weights_reshaped, past_key_value +# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->TimeSeriesTransformer class TimeSeriesTransformerEncoderLayer(nn.Module): def __init__(self, config: TimeSeriesTransformerConfig): super().__init__() @@ -600,18 +608,18 @@ def __init__(self, config: TimeSeriesTransformerConfig): def forward( self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - layer_head_mask: torch.Tensor, - output_attentions: bool = False, - ): + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + layer_head_mask: torch.FloatTensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size - *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size - *(config.encoder_attention_heads,)*. + `(encoder_attention_heads,)`. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -649,6 +657,7 @@ def forward( return outputs +# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->TimeSeriesTransformer class TimeSeriesTransformerDecoderLayer(nn.Module): def __init__(self, config: TimeSeriesTransformerConfig): super().__init__() @@ -683,24 +692,24 @@ def forward( encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, - cross_layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = True, - ): + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size - *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size - *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size - *(encoder_attention_heads,)*. 
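The layer docstrings above describe the attention mask as a `(batch, 1, tgt_len, src_len)` tensor "where padding elements are indicated by very large negative values". As an illustration of that convention, here is a hedged sketch of how a plain `(batch, src_len)` padding mask is usually turned into that additive form; it follows the standard BART `_expand_mask` helper this file copies from, but treat it as an assumption rather than the exact body used here.

```python
import torch


def expand_padding_mask(mask, dtype, tgt_len=None):
    # mask: (batch, src_len) with 1 = attend, 0 = padding
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len
    expanded = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    inverted = 1.0 - expanded
    # padded positions become a huge negative number, so softmax sends their weight to ~0
    return inverted.masked_fill(inverted.to(torch.bool), torch.finfo(dtype).min)


attention_mask = torch.tensor([[1, 1, 1, 0]])
additive = expand_padding_mask(attention_mask, torch.float32, tgt_len=2)
print(additive.shape)     # torch.Size([1, 1, 2, 4])
print(additive[0, 0, 0])  # three zeros, then roughly -3.4e38 for the padded position
```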
- cross_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of - size *(decoder_attention_heads,)*. + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under @@ -735,7 +744,7 @@ def forward( hidden_states=hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, - layer_head_mask=cross_layer_head_mask, + layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) From d639388ffcbfd6fcb320c128ef4cc2b4cc66d3e1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 23 Sep 2022 12:11:43 +0200 Subject: [PATCH 093/164] add encoder_outputs to TimeSeriesTransformerForPrediction forward --- .../time_series_transformer/modeling_time_series_transformer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 977fad7f407f9..7b682fa0d7911 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1587,6 +1587,7 @@ def forward( future_time_feat: Optional[torch.Tensor] = None, future_target: Optional[torch.Tensor] = None, future_observed_values: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -1604,6 +1605,7 @@ def forward( past_observed_values=past_observed_values, future_time_feat=future_time_feat, future_target=future_target, + encoder_outputs=encoder_outputs, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, From 8cbe681d66e7fd861451eaa5f66c5d582620a6c0 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 23 Sep 2022 10:23:30 +0000 Subject: [PATCH 094/164] Improve docs, add model to README --- README.md | 2 + README_ko.md | 2 + README_zh-hans.md | 2 + README_zh-hant.md | 2 + docs/source/en/_toctree.yml | 5 ++ docs/source/en/index.mdx | 3 + .../en/model_doc/time_series_transformer.mdx | 20 +++-- .../run_ts_no_trainer.py | 32 ++++--- .../models/auto/configuration_auto.py | 6 +- src/transformers/models/auto/modeling_auto.py | 2 +- .../modeling_time_series_transformer.py | 84 +++++++++---------- src/transformers/utils/dummy_pt_objects.py | 24 ++++++ .../test_modeling_time_series_transformer.py | 35 +------- 13 files changed, 122 insertions(+), 97 deletions(-) diff --git a/README.md b/README.md index 5e17e33b204cc..7395c30e92935 100644 --- a/README.md +++ b/README.md @@ -373,6 +373,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Time Series Transformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from ) released with the paper []() by . +1. **[TimeSeriesTransformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from ) released with the paper []() by . 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. diff --git a/README_ko.md b/README_ko.md index f53075ff5fe6f..790ff54d83fdf 100644 --- a/README_ko.md +++ b/README_ko.md @@ -323,6 +323,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. 
**[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Time Series Transformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from HuggingFace). +1. **[TimeSeriesTransformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from ) released with the paper []() by . 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. diff --git a/README_zh-hans.md b/README_zh-hans.md index 2843a8eb29a08..9c6c71e0129c8 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -347,6 +347,8 @@ conda install -c huggingface transformers 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 +1. **[Time Series Transformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from HuggingFace). +1. **[TimeSeriesTransformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from ) released with the paper []() by . 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. 
**[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 8f74b97e98549..25bd8d6d3a9e5 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -359,6 +359,8 @@ conda install -c huggingface transformers 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Time Series Transformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from HuggingFace). +1. **[TimeSeriesTransformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from ) released with the paper []() by . 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 223c5d2a6998f..a783f5b7bb8f2 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -490,6 +490,11 @@ - local: model_doc/trajectory_transformer title: Trajectory Transformer title: Reinforcement learning models + - isExpanded: false + sections: + - local: model_doc/time_series_trasnformer + title: Time Series Transformer + title: Time series models title: Models - sections: - local: internal/modeling_utils diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index e6a3d912b2743..25c0265b10e35 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -163,6 +163,8 @@ The documentation is organized into five sections: 1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Time Series Transformer](model_doc/time_series_transformer)** (from ) released with the paper []() by . +1. **[TimeSeriesTransformer](model_doc/time_series_transformer)** (from ) released with the paper []() by . 1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. @@ -306,6 +308,7 @@ Flax), PyTorch, and/or TensorFlow. | Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ | | T5 | ✅ | ✅ | ✅ | ✅ | ✅ | | TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| Time Series Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | | Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | | Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | | TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/time_series_transformer.mdx b/docs/source/en/model_doc/time_series_transformer.mdx index cd309be74fdb9..c6bb60823a16b 100644 --- a/docs/source/en/model_doc/time_series_transformer.mdx +++ b/docs/source/en/model_doc/time_series_transformer.mdx @@ -10,21 +10,20 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. 
--> -# TimeSeriesTransformer +# Time Series Transformer ## Overview -The TimeSeriesTransformer model was proposed in []() by . - -The abstract from the paper is the following: - -** +The Time Series Transformer model is a vanilla encoder-decoder Transformer for time series forecasting and classification. Tips: - +- The model is trained using "teacher-forcing", similar to machine translation. This means that, during training, one provides the ground truth +previous targets to the model rather than the model's predictions in order to predict the next target. Only at inference time, we sample from the model +to make a prediction at each time step, which is then fed to the model in order to make the next prediction (also called autoregressive generation). + +This model was contributed by [kashif](). The original code can be found [here](). ## TimeSeriesTransformerConfig @@ -36,3 +35,8 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE](=1.8.0", "To fix: pip install -r examples/pytorch/time-series-prediction/requirements.txt") @@ -59,7 +70,7 @@ def parse_args(): "--prediction_length", type=int, default=None, - help=("The prediction horizon of the time series forecasting task."), + help="The prediction horizon of the time series forecasting task.", ) parser.add_argument( "--freq", @@ -192,3 +203,4 @@ def main(): # model model = TimeSeriesTransformerForPrediction(config) + print(model) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index a63d8d0077ae7..217e86f6224d7 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -29,7 +29,6 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here - ("time_series_transformer", "TimeSeriesTransformerConfig"), ("albert", "AlbertConfig"), ("bart", "BartConfig"), ("beit", "BeitConfig"), @@ -133,6 +132,7 @@ ("swinv2", "Swinv2Config"), ("t5", "T5Config"), ("tapas", "TapasConfig"), + ("time_series_transformer", "TimeSeriesTransformerConfig"), ("trajectory_transformer", "TrajectoryTransformerConfig"), ("transfo-xl", "TransfoXLConfig"), ("trocr", "TrOCRConfig"), @@ -165,7 +165,6 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( [ # Add archive maps here) - ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -260,6 +259,7 @@ ("swinv2", "SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("t5", "T5_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("unispeech", "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("unispeech-sat", "UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -286,7 +286,6 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here - ("time_series_transformer", "TimeSeriesTransformer"), ("albert", "ALBERT"), ("bart", "BART"), ("barthez", "BARThez"), @@ -409,6 +408,7 @@ ("t5v1.1", "T5v1.1"), ("tapas", "TAPAS"), ("tapex", "TAPEX"), + ("time_series_transformer", "Time Series Transformer"), ("trajectory_transformer", "Trajectory Transformer"), ("transfo-xl", "Transformer-XL"), ("trocr", "TrOCR"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 64e0c12fb68e2..102db34b784ca 
100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -28,7 +28,6 @@ MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping - ("time_series_transformer", "TimeSeriesTransformerModel"), ("albert", "AlbertModel"), ("bart", "BartModel"), ("beit", "BeitModel"), @@ -129,6 +128,7 @@ ("swinv2", "Swinv2Model"), ("t5", "T5Model"), ("tapas", "TapasModel"), + ("time_series_transformer", "TimeSeriesTransformerModel"), ("trajectory_transformer", "TrajectoryTransformerModel"), ("transfo-xl", "TransfoXLModel"), ("unispeech", "UniSpeechModel"), diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 977fad7f407f9..1299ecb27ed7e 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -28,7 +28,6 @@ from ...modeling_utils import PreTrainedModel from ...utils import ( add_code_sample_docstrings, - add_end_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging, @@ -219,18 +218,22 @@ def forward(self, features: torch.Tensor) -> torch.Tensor: class MeanScaler(nn.Module): """ - Computes a scaling factor as the weighted average absolute value along dimension ``dim``, and scales the data - accordingly. Parameters ---------- dim - dimension along which to compute the scale - keepdim - controls whether to retain dimension ``dim`` (of length 1) in the scale tensor, or suppress it. - minimum_scale - default scale that is used for elements that are constantly zero along dimension ``dim``. + Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + accordingly. + + Args: + dim (`int`): + Dimension along which to compute the scale. + keepdim (`bool`, *optional*, defaults to `False`): + Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + minimum_scale (`float`, *optional*, defaults to 1e-10): + Default scale that is used for elements that are constantly zero along dimension ``dim``. """ def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10): super().__init__() - assert dim > 0, "Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0" + if not dim > 0: + raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") self.dim = dim self.keepdim = keepdim self.register_buffer("minimum_scale", torch.tensor(minimum_scale)) @@ -270,10 +273,12 @@ def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tens class NOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along dimension ``dim``, and therefore applies no scaling to the input data. - Parameters ---------- dim - dimension along which to compute the scale - keepdim - controls whether to retain dimension ``dim`` (of length 1) in the scale tensor, or suppress it. + + Args: + dim (`int`): + Dimension along which to compute the scale. + keepdim (`bool`, *optional*, defaults to `False`): + Controls whether to retain dimension ``dim`` (of length 1) in the scale tensor, or suppress it. 
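The `MeanScaler` documented above computes its scale as the weighted average absolute value along the time dimension, with unobserved steps carrying weight zero. A toy numeric sketch of that idea follows; it deliberately leaves out the `minimum_scale` floor registered by the class and any other safeguards the full implementation applies.

```python
import torch

data = torch.tensor([[1.0, -3.0, 0.0, 4.0]]).unsqueeze(-1)      # (batch=1, time=4, 1)
observed = torch.tensor([[1.0, 1.0, 0.0, 1.0]]).unsqueeze(-1)   # 0 marks a missing step

numerator = (data.abs() * observed).sum(dim=1, keepdim=True)    # |1| + |-3| + |4| = 8
denominator = observed.sum(dim=1, keepdim=True).clamp_min(1.0)  # 3 observed steps
scale = numerator / denominator                                 # ~2.667
scaled_data = data / scale                                      # what the model would consume
print(scale.squeeze().item(), scaled_data.squeeze())
```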
""" def __init__(self, dim: int, keepdim: bool = False): @@ -289,31 +294,38 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple return data, scale -def _weighted_average(x: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: +def _weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: """ - Computes the weighted average of a given tensor across a given dim, masking values associated with weight zero, - meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. Parameters ---------- x - Input tensor, of which the average must be computed. - weights - Weights tensor, of the same shape as `x`. - dim - The dim along which to average `x` - Returns ------- Tensor: - The tensor with values averaged along the specified `dim`. + Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero, + meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. + + Args: + input_tensor (`torch.FloatTensor`): + Input tensor, of which the average must be computed. + weights (`torch.FloatTensor`, *optional*): + Weights tensor, of the same shape as `input_tensor`. + dim (`int`, *optional*): + The dim along which to average `input_tensor`. + + Returns: + `torch.FloatTensor`: The tensor with values averaged along the specified `dim`. """ if weights is not None: - weighted_tensor = torch.where(weights != 0, x * weights, torch.zeros_like(x)) + weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor)) sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0) return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights else: - return x.mean(dim=dim) + return input_tensor.mean(dim=dim) class NegativeLogLikelihood: """ - Compute the negative log likelihood loss. Parameters ---------- beta: float in range (0, 1) - beta parameter from the paper: "On the Pitfalls of Heteroscedastic Uncertainty Estimation with Probabilistic - Neural Networks" by Seitzer et al. 2022 https://openreview.net/forum?id=aPOpXlnV1T + Computes the negative log likelihood loss. + + Args: + beta (`float`): + Float in range (0, 1). The beta parameter from the paper: "On the Pitfalls of Heteroscedastic Uncertainty Estimation + with Probabilistic Neural Networks" by [Seitzer et al. 2022](https://openreview.net/forum?id=aPOpXlnV1T). """ beta: float = 0.0 @@ -1549,7 +1561,7 @@ def forward( ) -class TimeSeriesTransformerForPrediction(TimeSeriesTransformerModel): +class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel): def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) self.model = TimeSeriesTransformerModel(config) @@ -1717,17 +1729,3 @@ def generate( (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape, ) ) - - -class TimeSeriesTransformerDecoderWrapper(TimeSeriesTransformerPreTrainedModel): - """ - This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is - used in combination with the [`EncoderDecoderModel`] framework. 
- """ - - def __init__(self, config): - super().__init__(config) - self.decoder = TimeSeriesTransformerDecoder(config) - - def forward(self, *args, **kwargs): - return self.decoder(*args, **kwargs) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index e9f1bae358f3a..9be8f0a56d2ea 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4735,6 +4735,30 @@ def load_tf_weights_in_t5(*args, **kwargs): requires_backends(load_tf_weights_in_t5, ["torch"]) +TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TimeSeriesTransformerForPrediction(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class TimeSeriesTransformerModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class TimeSeriesTransformerPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 4701d3015bb82..ed09efddf5f98 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -15,13 +15,11 @@ """ Testing suite for the PyTorch TimeSeriesTransformer model. """ -import copy import tempfile import unittest from transformers import is_torch_available from transformers.testing_utils import require_torch, slow, torch_device -from transformers.utils import cached_property from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor @@ -96,7 +94,6 @@ def prepare_config_and_inputs(self): # decoder inputs future_time_feat = floats_tensor([self.batch_size, self.prediction_length, self.num_time_features]) future_target = floats_tensor([self.batch_size, self.prediction_length]) - future_observed_values = floats_tensor([self.batch_size, self.prediction_length]) config = TimeSeriesTransformerConfig( encoder_layers=self.num_hidden_layers, @@ -384,41 +381,15 @@ def _long_tensor(tok_lst): @require_torch @slow class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): - # @cached_property - # def default_tokenizer(self): - # return TimeSeriesTransformerTokenizer.from_pretrained("huggingface/tst-ett") - def test_inference_no_head(self): model = TimeSeriesTransformerModel.from_pretrained("huggingface/tst-ett").to(torch_device) - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]]) - inputs_dict = prepare_time_series_transformer_inputs_dict(model.config, input_ids, decoder_input_ids) - with torch.no_grad(): - output = model(**inputs_dict)[0] - expected_shape = torch.Size((1, 11, 1024)) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = torch.tensor( - [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device - ) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + + raise NotImplementedError("To 
do") def test_inference_head(self): model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/tst-ett").to(torch_device) - # change to intended input - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - inputs_dict = prepare_time_series_transformer_inputs_dict(model.config, input_ids, decoder_input_ids) - with torch.no_grad(): - output = model(**inputs_dict)[0] - expected_shape = torch.Size((1, 11, model.config.vocab_size)) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = torch.tensor( - [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device - ) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + raise NotImplementedError("To do") def test_seq_to_seq_generation(self): raise NotImplementedError("Generation not implemented yet") From c658f03d240094a561a69b3c4a4828ddaac645ba Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 23 Sep 2022 12:39:30 +0200 Subject: [PATCH 095/164] added test_forward_signature --- .../test_modeling_time_series_transformer.py | 90 +++++++++++++------ 1 file changed, 61 insertions(+), 29 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 4701d3015bb82..e28f9c2942e03 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -14,14 +14,12 @@ # limitations under the License. """ Testing suite for the PyTorch TimeSeriesTransformer model. 
""" - -import copy +import inspect import tempfile import unittest from transformers import is_torch_available from transformers.testing_utils import require_torch, slow, torch_device -from transformers.utils import cached_property from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor @@ -131,38 +129,38 @@ def prepare_config_and_inputs_for_common(self): config, inputs_dict = self.prepare_config_and_inputs() return config, inputs_dict - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = TimeSeriesTransformerModel(config=config).get_decoder().to(torch_device).eval() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] + # def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + # model = TimeSeriesTransformerModel(config=config).get_decoder().to(torch_device).eval() + # input_ids = inputs_dict["input_ids"] + # attention_mask = inputs_dict["attention_mask"] - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + # # first forward pass + # outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - output, past_key_values = outputs.to_tuple() + # output, past_key_values = outputs.to_tuple() - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) + # # create hypothetical multiple next token and extent to next_input_ids + # next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + # next_attn_mask = ids_tensor((self.batch_size, 3), 2) - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + # # append to next input_ids and + # next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + # next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] + # output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + # output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + # "last_hidden_state" + # ] - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + # # select random slice + # random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + # output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + # output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + # self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + # # test that outputs are equal for slice + # self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, 
atol=1e-2)) def check_encoder_decoder_model_standalone(self, config, inputs_dict): model = TimeSeriesTransformerModel(config=config).to(torch_device).eval() @@ -180,7 +178,7 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict): enc_input = transformer_inputs[:, : config.context_length, ...] dec_input = transformer_inputs[:, config.context_length :, ...] - encoder_last_hidden_state_2 = encoder(input_ids=enc_input)[0] + encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) @@ -190,7 +188,7 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict): decoder = TimeSeriesTransformerDecoder.from_pretrained(tmpdirname).to(torch_device) last_hidden_state_2 = decoder( - input_ids=dec_input, + inputs_embeds=dec_input, encoder_hidden_states=encoder_last_hidden_state, )[0] @@ -245,6 +243,40 @@ def test_encoder_decoder_model_standalone(self): # model.generate(input_ids, attention_mask=attention_mask) # model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = [ + "feat_static_cat", + "feat_static_real", + "past_time_feat", + "past_target", + "past_observed_values", + "future_time_feat", + "future_target", + ] + + expected_arg_names.extend( + [ + "future_observed_values", + "encoder_outputs", + "use_cache", + "output_attentions", + "output_hidden_states", + "return_dict", + ] + if "future_observed_values" in arg_names + else ["encoder_outputs", "output_hidden_states", "use_cache", "output_attentions", "return_dict"] + ) + + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True From 13de9c67eadc31bbfe4bb1e75fd66a9ec7b31d01 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 23 Sep 2022 10:46:34 +0000 Subject: [PATCH 096/164] More improvements --- .../run_ts_no_trainer.py | 8 + .../time-series-prediction/utils_ts.py | 2 +- .../configuration_time_series_transformer.py | 10 +- .../modeling_time_series_transformer.py | 131 ++++++++++----- .../test_modeling_time_series_transformer.py | 156 ++++++------------ utils/check_repo.py | 2 + 6 files changed, 160 insertions(+), 149 deletions(-) diff --git a/examples/pytorch/time-series-prediction/run_ts_no_trainer.py b/examples/pytorch/time-series-prediction/run_ts_no_trainer.py index b72327d29d502..e27f3cf985491 100644 --- a/examples/pytorch/time-series-prediction/run_ts_no_trainer.py +++ b/examples/pytorch/time-series-prediction/run_ts_no_trainer.py @@ -37,6 +37,7 @@ logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/time-series-prediction/requirements.txt") + # Parsing input arguments def parse_args(): parser = argparse.ArgumentParser( @@ -144,6 +145,8 @@ def parse_args(): ) args = parser.parse_args() + return args + def main(): args = parse_args() @@ -203,4 +206,9 @@ def main(): # model model = TimeSeriesTransformerForPrediction(config) + + # just printing for now to make sure quality 
passes + print(args) + print(raw_datasets) print(model) + print(repo) diff --git a/examples/pytorch/time-series-prediction/utils_ts.py b/examples/pytorch/time-series-prediction/utils_ts.py index 382dca41e84d4..e1dfc1cfd19c9 100644 --- a/examples/pytorch/time-series-prediction/utils_ts.py +++ b/examples/pytorch/time-series-prediction/utils_ts.py @@ -15,7 +15,7 @@ """ Transformations Utilities for Time Series Transformers. """ from functools import lru_cache -from typing import Iterable, List, Optional +from typing import Iterable, Optional import pandas as pd from torch.utils.data import DataLoader diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 703992e6fd3b7..3357f395d6fe2 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TimeSeriesTransformer model configuration""" +""" Time Series Transformer model configuration""" from typing import List, Optional from ...configuration_utils import PretrainedConfig @@ -30,9 +30,9 @@ class TimeSeriesTransformerConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`~TimeSeriesTransformerModel`]. It is used to - instantiate a TimeSeriesTransformer model according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar configuration to that of the - TimeSeriesTransformer [huggingface/tst-ett](https://huggingface.co/huggingface/tst-ett) architecture. + instantiate a Time Series Transformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series + Transformer [huggingface/tst-ett](https://huggingface.co/huggingface/tst-ett) architecture. Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -105,7 +105,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): ```python >>> from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerModel - >>> # Initializing a TimeSeriesTransformer huggingface/tst-ett style configuration + >>> # Initializing a Time Series Transformer huggingface/tst-ett style configuration >>> configuration = TimeSeriesTransformerConfig() >>> # Initializing a model from the huggingface/tst-ett style configuration diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 1299ecb27ed7e..f96d4dbe2cfe0 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch TimeSeriesTransformer model.""" +""" PyTorch Time Series Transformer model.""" import random from dataclasses import dataclass @@ -26,13 +26,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput from ...modeling_utils import PreTrainedModel -from ...utils import ( - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_time_series_transformer import TimeSeriesTransformerConfig @@ -294,7 +288,7 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple return data, scale -def _weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: +def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: """ Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero, meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. @@ -324,8 +318,9 @@ class NegativeLogLikelihood: Args: beta (`float`): - Float in range (0, 1). The beta parameter from the paper: "On the Pitfalls of Heteroscedastic Uncertainty Estimation - with Probabilistic Neural Networks" by [Seitzer et al. 2022](https://openreview.net/forum?id=aPOpXlnV1T). + Float in range (0, 1). The beta parameter from the paper: "On the Pitfalls of Heteroscedastic Uncertainty + Estimation with Probabilistic Neural Networks" by [Seitzer et al. + 2022](https://openreview.net/forum?id=aPOpXlnV1T). """ beta: float = 0.0 @@ -848,23 +843,6 @@ def _set_gradient_checkpointing(self, module, value=False): [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -TIME_SERIES_TRANSFORMER_PREDICTION_EXAMPLE = r""" - Summarization example: - - ```python - >>> from transformers import TimeSeriesTransformerForPrediction - - >>> model = TimeSeriesTransformerForConditionalGeneration.from_pretrained("huggingface/tst-ett") - - >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." 
- >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt") - - >>> # Generate Summary - >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5) - >>> print(tokenizer.decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) - ``` -""" - TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): @@ -1481,11 +1459,7 @@ def get_decoder(self): return self.decoder @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=Seq2SeqTSModelOutput, - config_class=_CONFIG_FOR_DOC, - ) + @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, feat_static_cat: torch.Tensor, @@ -1501,6 +1475,47 @@ def forward( output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import TimeSeriesTransformerModel + >>> import torch + + >>> model = TimeSeriesTransformerModel.from_pretrained("huggingface/tst-base") + + >>> inputs = dict() + >>> batch_size = 2 + >>> cardinality = 5 + >>> num_time_features = 10 + >>> content_length = 8 + >>> prediction_length = 2 + >>> lags_seq = [2, 3] + >>> past_length = context_length + max(lags_seq) + + >>> # encoder inputs + >>> inputs["feat_static_cat"] = ids_tensor([batch_size, 1], cardinality) + >>> inputs["feat_static_real"] = torch.randn([batch_size, 1]) + >>> inputs["past_time_feat"] = torch.randn([batch_size, past_length, num_time_features]) + >>> inputs["past_target"] = torch.randn([batch_size, past_length]) + >>> inputs["past_observed_values"] = torch.randn([batch_size, past_length]) + + >>> # decoder inputs + >>> inputs["future_time_feat"] = torch.randn([batch_size, prediction_length, num_time_features]) + >>> inputs["future_target"] = torch.randn([batch_size, prediction_length]) + + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + transformer_inputs, scale, static_feat = self.create_network_inputs( feat_static_cat=feat_static_cat, feat_static_real=feat_static_real, @@ -1511,13 +1526,6 @@ def forward( future_target=future_target, ) - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if encoder_outputs is None: enc_input = transformer_inputs[:, : self.config.context_length, ...] 
encoder_outputs = self.encoder( @@ -1561,6 +1569,10 @@ def forward( ) +@add_start_docstrings( + "The Time Series Transformer Model with a distribution head on top for time-series forecasting.", + TIME_SERIES_TRANSFORMER_START_DOCSTRING, +) class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel): def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) @@ -1589,6 +1601,8 @@ def output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist sliced_params = [p[:, -trailing_n:] for p in params] return self.distribution_output.distribution(sliced_params, scale=scale) + @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, feat_static_cat: torch.Tensor, @@ -1604,6 +1618,41 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import TimeSeriesTransformerForPrediction + >>> import torch + + >>> model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/tst-base") + + >>> inputs = dict() + >>> batch_size = 2 + >>> cardinality = 5 + >>> num_time_features = 10 + >>> content_length = 8 + >>> prediction_length = 2 + >>> lags_seq = [2, 3] + >>> past_length = context_length + max(lags_seq) + + >>> # encoder inputs + >>> inputs["feat_static_cat"] = ids_tensor([batch_size, 1], cardinality) + >>> inputs["feat_static_real"] = torch.randn([batch_size, 1]) + >>> inputs["past_time_feat"] = torch.randn([batch_size, past_length, num_time_features]) + >>> inputs["past_target"] = torch.randn([batch_size, past_length]) + >>> inputs["past_observed_values"] = torch.randn([batch_size, past_length]) + + >>> # decoder inputs + >>> inputs["future_time_feat"] = torch.randn([batch_size, prediction_length, num_time_features]) + >>> inputs["future_target"] = torch.randn([batch_size, prediction_length]) + + >>> outputs = model(**inputs) + >>> loss = outputs.loss + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if future_target is not None: use_cache = False @@ -1635,7 +1684,7 @@ def forward( else: loss_weights = future_observed_values.min(dim=-1, keepdim=False) - prediction_loss = _weighted_average(loss, weights=loss_weights) + prediction_loss = weighted_average(loss, weights=loss_weights) if not return_dict: outputs = (params + outputs[1:]) if params is not None else outputs[1:] diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index ed09efddf5f98..a81ca5fd0f5c2 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -251,9 +251,6 @@ def test_attention_outputs(self): encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - chunk_length = getattr(self.model_tester, "chunk_length", None) - if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): - encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True @@ 
-275,119 +272,74 @@ def test_attention_outputs(self): model.eval() with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + attentions = outputs.encoder_attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - if chunk_length is not None: - self.assertListEqual( - list(attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) out_len = len(outputs) - if self.is_encoder_decoder: - correct_outlen = 6 - - if "last_hidden_state" in outputs: - correct_outlen += 1 - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - if chunk_length is not None: - self.assertListEqual( - list(self_attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - -def assert_tensors_close(a, b, atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if torch.allclose(a, b, atol=atol): - return True - raise - except Exception: - pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() - if a.numel() > 100: - msg = f"tensor values are {pct_different:.1%} percent different." 
- else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -def _long_tensor(tok_lst): - return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) - - -TOLERANCE = 1e-4 + correct_outlen = 6 + + if "last_hidden_state" in outputs: + correct_outlen += 1 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 2, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) @require_torch @slow class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): def test_inference_no_head(self): - model = TimeSeriesTransformerModel.from_pretrained("huggingface/tst-ett").to(torch_device) + # model = TimeSeriesTransformerModel.from_pretrained("huggingface/tst-ett").to(torch_device) raise NotImplementedError("To do") def test_inference_head(self): - model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/tst-ett").to(torch_device) + # model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/tst-ett").to(torch_device) raise NotImplementedError("To do") diff --git a/utils/check_repo.py b/utils/check_repo.py index 988967e797d12..9b63515b62ca9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -46,6 +46,8 @@ # Being in this list is an exception and should **not** be the rule. IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ # models to ignore for not tested + "TimeSeriesTransformerEncoder", # Building part of bigger (tested) model. + "TimeSeriesTransformerDecoder", # Building part of bigger (tested) model. "DeformableDetrEncoder", # Building part of bigger (tested) model. "DeformableDetrDecoder", # Building part of bigger (tested) model. "OPTDecoder", # Building part of bigger (tested) model. 
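For reference, the doctest added to `TimeSeriesTransformerForPrediction.forward` above can also be run as a small standalone script. This is a minimal sketch, not part of the patch: it assumes the `huggingface/tst-base` checkpoint named in the example is available and configured compatibly with the toy dimensions below, and it uses plain `torch` random tensors in place of the `ids_tensor` test helper.

```python
import torch

from transformers import TimeSeriesTransformerForPrediction

# Checkpoint name taken from the docstring example above; assumed to exist and to use
# dimensions compatible with the toy inputs built here.
model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/tst-base")

batch_size = 2
cardinality = 5
num_time_features = 10
context_length = 8
prediction_length = 2
lags_seq = [2, 3]
past_length = context_length + max(lags_seq)  # the encoder also needs the lagged history

inputs = {
    # encoder inputs
    "feat_static_cat": torch.randint(0, cardinality, (batch_size, 1)),  # stand-in for the `ids_tensor` helper
    "feat_static_real": torch.randn(batch_size, 1),
    "past_time_feat": torch.randn(batch_size, past_length, num_time_features),
    "past_target": torch.randn(batch_size, past_length),
    "past_observed_values": torch.ones(batch_size, past_length),  # 1.0 where the target was observed
    # decoder inputs
    "future_time_feat": torch.randn(batch_size, prediction_length, num_time_features),
    "future_target": torch.randn(batch_size, prediction_length),
    "future_observed_values": torch.ones(batch_size, prediction_length),
}

outputs = model(**inputs)
print(outputs.loss)
```

Passing `future_target` (together with `future_observed_values`) is what produces the distributional training loss; at inference time these decoder targets are omitted and the model's `generate` method is used to sample forecasts instead.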
From 649373fc8b9daf4c3d3c096ecdbc419bf1b27f8d Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 23 Sep 2022 11:30:43 +0000 Subject: [PATCH 097/164] Add more copied from --- .../modeling_time_series_transformer.py | 39 +++---------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index f96d4dbe2cfe0..b16e1e2b32f15 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -19,7 +19,6 @@ from typing import Callable, Dict, List, Optional, Tuple import torch -import torch.nn.functional as F from torch import nn from torch.distributions import AffineTransform, Distribution, StudentT, TransformedDistribution @@ -173,8 +172,8 @@ class StudentTOutput(DistributionOutput): @classmethod def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor): - scale = F.softplus(scale) - df = 2.0 + F.softplus(df) + scale = nn.functional.softplus(scale) + df = 2.0 + nn.functional.softplus(df) return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) @property @@ -183,11 +182,7 @@ def event_shape(self) -> Tuple: class FeatureEmbedder(nn.Module): - def __init__( - self, - cardinalities: List[int], - embedding_dims: List[int], - ) -> None: + def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None: super().__init__() self.num_features = len(cardinalities) @@ -333,6 +328,7 @@ def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor return nll +# Copied from transformers.models.bart.modeling_bart._make_causal_mask def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): """ Make causal mask used for bi-directional self-attention. @@ -348,6 +344,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) +# Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
@@ -359,7 +356,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] inverted_mask = 1.0 - expanded_mask - return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) @dataclass @@ -782,30 +779,6 @@ def forward( return outputs -class TimeSeriesTransformerClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__( - self, - input_dim: int, - inner_dim: int, - num_classes: int, - pooler_dropout: float, - ): - super().__init__() - self.dense = nn.Linear(input_dim, inner_dim) - self.dropout = nn.Dropout(p=pooler_dropout) - self.out_proj = nn.Linear(inner_dim, num_classes) - - def forward(self, hidden_states: torch.Tensor): - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = torch.tanh(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.out_proj(hidden_states) - return hidden_states - - class TimeSeriesTransformerPreTrainedModel(PreTrainedModel): config_class = TimeSeriesTransformerConfig base_model_prefix = "model" From 08d81f53e8481fc8c87075a5064aba6e4f5cfd33 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 23 Sep 2022 11:35:13 +0000 Subject: [PATCH 098/164] Fix README --- README.md | 3 +-- README_ko.md | 1 - README_zh-hans.md | 1 - README_zh-hant.md | 1 - docs/source/en/index.mdx | 3 +-- 5 files changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7395c30e92935..5c6a7fdf8fb74 100644 --- a/README.md +++ b/README.md @@ -373,8 +373,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. -1. **[Time Series Transformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from ) released with the paper []() by . -1. **[TimeSeriesTransformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from ) released with the paper []() by . +1. **[Time Series Transformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from HuggingFace). 1. 
**[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. diff --git a/README_ko.md b/README_ko.md index 790ff54d83fdf..eb6f73df6ba84 100644 --- a/README_ko.md +++ b/README_ko.md @@ -324,7 +324,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSeriesTransformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from ) released with the paper []() by . 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. diff --git a/README_zh-hans.md b/README_zh-hans.md index 9c6c71e0129c8..c6da70005ab55 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -348,7 +348,6 @@ conda install -c huggingface transformers 1. 
**[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSeriesTransformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from ) released with the paper []() by . 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 25bd8d6d3a9e5..c2b520f6e19b9 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -360,7 +360,6 @@ conda install -c huggingface transformers 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSeriesTransformer](https://huggingface.co/docs/transformers/main/model_doc/time_series_transformer)** (from ) released with the paper []() by . 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. 
**[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 25c0265b10e35..2c5340d90fabf 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -163,8 +163,7 @@ The documentation is organized into five sections: 1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. -1. **[Time Series Transformer](model_doc/time_series_transformer)** (from ) released with the paper []() by . -1. **[TimeSeriesTransformer](model_doc/time_series_transformer)** (from ) released with the paper []() by . +1. **[Time Series Transformer](model_doc/time_series_transformer)** (from HuggingFace). 1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 
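The `# Copied from transformers.models.bart.modeling_bart` markers added a couple of patches back tie the two mask helpers to their BART originals. As a rough illustration of what the mask expansion computes, here is a self-contained sketch (the function name is hypothetical; the real helper lives in the modeling file and should be treated as the reference):

```python
import torch


def expand_mask_sketch(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None) -> torch.Tensor:
    """Turn a `[bsz, seq_len]` padding mask of 1s (keep) and 0s (pad) into the additive
    `[bsz, 1, tgt_seq_len, src_seq_len]` form that the attention layers consume."""
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    inverted_mask = 1.0 - expanded_mask
    # padded positions get the most negative representable value, so they vanish after softmax
    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


mask = torch.tensor([[1, 1, 0]])  # one sequence with a single padded position
print(expand_mask_sketch(mask, torch.float32).shape)  # torch.Size([1, 1, 3, 3])
```

The `.to(torch.bool)` cast in the last line mirrors the change made in the patch, which replaced the earlier `.bool()` call so that the mask dtype handling matches the BART original.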
From 390db662faffb237d9df5d010de37b4ba1714bca Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 23 Sep 2022 11:47:25 +0000 Subject: [PATCH 099/164] Fix remaining quality issues --- .../modeling_time_series_transformer.py | 6 +++--- utils/check_repo.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index b16e1e2b32f15..2d6c17fa1d19c 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -216,7 +216,7 @@ class MeanScaler(nn.Module): keepdim (`bool`, *optional*, defaults to `False`): Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. minimum_scale (`float`, *optional*, defaults to 1e-10): - Default scale that is used for elements that are constantly zero along dimension ``dim``. + Default scale that is used for elements that are constantly zero along dimension `dim`. """ def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10): @@ -261,13 +261,13 @@ def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tens class NOPScaler(nn.Module): """ - Assigns a scaling factor equal to 1 along dimension ``dim``, and therefore applies no scaling to the input data. + Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. Args: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension ``dim`` (of length 1) in the scale tensor, or suppress it. + Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. """ def __init__(self, dim: int, keepdim: bool = False): diff --git a/utils/check_repo.py b/utils/check_repo.py index 9b63515b62ca9..7d05e837b2a68 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -134,6 +134,7 @@ # should **not** be the rule. 
IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping + "TimeSeriesTransformerForPrediction", "PegasusXEncoder", "PegasusXDecoder", "PegasusXDecoderWrapper", From 973baf3f4f98619e042a8e4927d83aa5b61660a3 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 23 Sep 2022 14:31:22 +0200 Subject: [PATCH 100/164] updated encoder and decoder --- .../modeling_time_series_transformer.py | 87 ++++++++++--------- 1 file changed, 47 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 75411cfb8e64e..1863db2a0e379 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -16,7 +16,7 @@ import random from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple, Union import torch from torch import nn @@ -956,16 +956,15 @@ def __init__(self, config: TimeSeriesTransformerConfig): def forward( self, - inputs_embeds: torch.Tensor, - attention_mask=None, - head_mask=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: r""" Args: - inputs_embeds attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: @@ -998,8 +997,7 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = inputs_embeds # + embed_pos - + hidden_states = inputs_embeds hidden_states = self.layernorm_embedding(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1013,9 +1011,11 @@ def forward( # check if head_mask has a correct number of layers specified if desired if head_mask is not None: - assert head_mask.size()[0] == ( - len(self.layers) - ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + if head_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." 
+ ) for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: @@ -1090,11 +1090,13 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em if input_shape[-1] > 1: combined_attention_mask = _make_causal_mask( input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length - ).to(self.device) + ).to(inputs_embeds.device) if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) @@ -1103,21 +1105,20 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em def forward( self, - inputs_embeds: torch.Tensor, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - head_mask=None, - cross_attn_head_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: r""" Args: - inputs_embeds attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: @@ -1143,7 +1144,8 @@ def forward( - 0 indicates the head is **masked**. cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. @@ -1158,8 +1160,8 @@ def forward( If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of - all ``decoder_input_ids``` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` - of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of + shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. 
@@ -1193,7 +1195,7 @@ def forward( # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) - hidden_states = inputs_embeds # + positions + hidden_states = inputs_embeds hidden_states = self.layernorm_embedding(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1207,10 +1209,12 @@ def forward( # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): if attn_mask is not None: - assert attn_mask.size()[0] == (len(self.layers)), ( - f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" - f" {head_mask.size()[0]}." - ) + if attn_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: @@ -1222,10 +1226,10 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: + if use_cache: logger.warning( - "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache =" - " False`..." + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." ) use_cache = False @@ -1247,13 +1251,16 @@ def custom_forward(*inputs): None, ) else: + layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, layer_head_mask=(head_mask[idx] if head_mask is not None else None), - cross_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, From ea138ff37df3bed861f4ccb45a467c74d1cf1c7b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 23 Sep 2022 15:02:12 +0200 Subject: [PATCH 101/164] fix generate --- .../time_series_transformer/modeling_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 1863db2a0e379..006460bd1b55b 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1732,7 +1732,7 @@ def generate( # greedy decoding for k in range(self.config.prediction_length): - lagged_sequence = self.get_lagged_subsequences( + lagged_sequence = self.model.get_lagged_subsequences( sequence=repeated_past_target, subsequences_length=1 + k, shift=1, From 90767575cc205ae23a330334c88b2b05add30792 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 23 Sep 2022 15:53:43 +0200 Subject: [PATCH 102/164] output_hidden_states and use_cache are optional --- .../modeling_time_series_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 006460bd1b55b..a80fcfaafca40 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1450,8 +1450,8 @@ def forward( future_time_feat: Optional[torch.Tensor] = None, future_target: Optional[torch.Tensor] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, - output_hidden_states: bool = False, - use_cache: bool = False, + output_hidden_states: Optional[bool] = None, + use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ): From 409a88ab15a1bc18bd2804a3c48e08e5f4e4c388 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 23 Sep 2022 16:01:34 +0200 Subject: [PATCH 103/164] past key_values returned too --- .../test_modeling_time_series_transformer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index d275516b839d4..2cb5ab86d1b9d 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -320,6 +320,9 @@ def test_attention_outputs(self): if "last_hidden_state" in outputs: correct_outlen += 1 + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + self.assertEqual(out_len, correct_outlen) # decoder attentions From ed6b62acc5293ad228a1e371a5f433619df7d4ad Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 23 Sep 2022 16:10:51 +0200 Subject: [PATCH 104/164] initialize weights of distribution_output module --- .../modeling_time_series_transformer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index a80fcfaafca40..bde2357fff403 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1565,6 +1565,9 @@ def __init__(self, config: TimeSeriesTransformerConfig): if config.loss == "nll": self.loss = NegativeLogLikelihood() + # Initialize weights of distribution_output and apply final processing + self.post_init() + def output_params(self, dec_output): return self.param_proj(dec_output) From 1ef37e4b8f7afa10a4bbe8b88743b7bfaf44feb0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 23 Sep 2022 21:17:51 +0200 Subject: [PATCH 105/164] fixed more tests --- .../modeling_time_series_transformer.py | 29 +++++++++++++++---- .../test_modeling_time_series_transformer.py | 25 ++++++++++++++-- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index bde2357fff403..5b29f15b8e202 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -782,6 +782,7 @@ def forward( class 
TimeSeriesTransformerPreTrainedModel(PreTrainedModel): config_class = TimeSeriesTransformerConfig base_model_prefix = "model" + main_input_name = "past_target" supports_gradient_checkpointing = True def _init_weights(self, module): @@ -1442,14 +1443,20 @@ def get_decoder(self): @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, + past_target: torch.Tensor, feat_static_cat: torch.Tensor, feat_static_real: torch.Tensor, past_time_feat: torch.Tensor, - past_target: torch.Tensor, past_observed_values: torch.Tensor, future_time_feat: Optional[torch.Tensor] = None, future_target: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, output_hidden_states: Optional[bool] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -1510,6 +1517,8 @@ def forward( enc_input = transformer_inputs[:, : self.config.context_length, ...] encoder_outputs = self.encoder( inputs_embeds=enc_input, + attention_mask=attention_mask, + head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -1525,7 +1534,12 @@ def forward( dec_input = transformer_inputs[:, self.config.context_length :, ...] decoder_outputs = self.decoder( inputs_embeds=dec_input, + attention_mask=decoder_attention_mask, encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -1588,10 +1602,10 @@ def output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, + past_target: torch.Tensor, feat_static_cat: torch.Tensor, feat_static_real: torch.Tensor, past_time_feat: torch.Tensor, - past_target: torch.Tensor, past_observed_values: torch.Tensor, future_time_feat: Optional[torch.Tensor] = None, future_target: Optional[torch.Tensor] = None, @@ -1642,10 +1656,10 @@ def forward( use_cache = False outputs = self.model( + past_target=past_target, feat_static_cat=feat_static_cat, feat_static_real=feat_static_real, past_time_feat=past_time_feat, - past_target=past_target, past_observed_values=past_observed_values, future_time_feat=future_time_feat, future_target=future_target, @@ -1658,12 +1672,15 @@ def forward( prediction_loss = None params = None - if future_target is not None and future_observed_values is not None: - params = self.output_params(outputs.last_hidden_state) - distr = self.output_distribution(params, outputs.scale) + if future_target is not None: + params = self.output_params(outputs[0]) # outputs.last_hidden_state + distr = self.output_distribution(params, outputs[-2]) # outputs.scale loss = self.loss(distr, future_target) + if future_observed_values is None: + future_observed_values = torch.ones_like(future_target) + if len(self.target_shape) == 0: loss_weights = future_observed_values else: diff --git 
a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 2cb5ab86d1b9d..4526d825cc548 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -14,11 +14,13 @@ # limitations under the License. """ Testing suite for the PyTorch TimeSeriesTransformer model. """ +import copy import inspect import tempfile import unittest -from transformers import is_torch_available +from transformers import is_torch_available, MODEL_MAPPING +from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device from ...test_configuration_common import ConfigTester @@ -210,6 +212,8 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase): test_head_masking = False test_missing_keys = False test_torchscript = False + test_inputs_embeds = False + test_model_common_attributes = False def setUp(self): self.model_tester = TimeSeriesTransformerModelTester(self) @@ -242,6 +246,17 @@ def test_encoder_decoder_model_standalone(self): # model.generate(input_ids, attention_mask=attention_mask) # model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + # Ignore since we have no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # # Input is 'feat_static_cat' not 'input_ids' + def test_model_main_input_name(self): + model_signature = inspect.signature(getattr(TimeSeriesTransformerModel, "forward")) + # The main input is the name of the argument after `self` + observed_main_input_name = list(model_signature.parameters.keys())[1] + self.assertEqual(TimeSeriesTransformerModel.main_input_name, observed_main_input_name) + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -252,10 +267,10 @@ def test_forward_signature(self): arg_names = [*signature.parameters.keys()] expected_arg_names = [ + "past_target", "feat_static_cat", "feat_static_real", "past_time_feat", - "past_target", "past_observed_values", "future_time_feat", "future_target", @@ -323,6 +338,12 @@ def test_attention_outputs(self): if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned + if "loss" in outputs: + correct_outlen += 1 + + if "params" in outputs: + correct_outlen += 1 + self.assertEqual(out_len, correct_outlen) # decoder attentions From b6b6be3c3a053c9e8faae139010542f68f5e12db Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 26 Sep 2022 12:53:39 +0200 Subject: [PATCH 106/164] update test_forward_signature --- .../test_modeling_time_series_transformer.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 4526d825cc548..cd1768d06cdfb 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -286,7 +286,19 @@ def test_forward_signature(self): "return_dict", ] if "future_observed_values" in arg_names - else ["encoder_outputs", "output_hidden_states", "use_cache", "output_attentions", "return_dict"] + else [ + "attention_mask", + "decoder_attention_mask", + "head_mask", + "decoder_head_mask", + 
"cross_attn_head_mask", + "encoder_outputs", + "past_key_values", + "output_hidden_states", + "use_cache", + "output_attentions", + "return_dict", + ] ) self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) From 0e7d3ed348eaba19bb5387a434852b7a214f3d0e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 26 Sep 2022 15:05:54 +0200 Subject: [PATCH 107/164] fix return_dict outputs --- .../modeling_time_series_transformer.py | 2 +- .../test_modeling_time_series_transformer.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 5b29f15b8e202..99134ce9e998c 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1689,7 +1689,7 @@ def forward( prediction_loss = weighted_average(loss, weights=loss_weights) if not return_dict: - outputs = (params + outputs[1:]) if params is not None else outputs[1:] + outputs = ((params,) + outputs[1:]) if params is not None else outputs[1:] return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs return Seq2SeqTSPredictionOutput( diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index cd1768d06cdfb..5daf018302e5e 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -19,7 +19,7 @@ import tempfile import unittest -from transformers import is_torch_available, MODEL_MAPPING +from transformers import MODEL_MAPPING, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device @@ -116,10 +116,10 @@ def prepare_config_and_inputs(self): ) inputs_dict = { + "past_target": past_target, "feat_static_cat": feat_static_cat, "feat_static_real": feat_static_real, "past_time_feat": past_time_feat, - "past_target": past_target, "future_time_feat": future_time_feat, "past_observed_values": past_observed_values, "future_target": future_target, From eb3ef8af289dfacddf08df22554fede8aba4e3dd Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 26 Sep 2022 20:33:06 +0200 Subject: [PATCH 108/164] Update src/transformers/models/time_series_transformer/configuration_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../configuration_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 3357f395d6fe2..dbd8ab1eea33b 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -29,7 +29,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`~TimeSeriesTransformerModel`]. It is used to + This is the configuration class to store the configuration of a [`TimeSeriesTransformerModel`]. 
It is used to instantiate a Time Series Transformer model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series Transformer [huggingface/tst-ett](https://huggingface.co/huggingface/tst-ett) architecture. From ab05cf5dc27fb98906945aa33aaf23dc72a1a1db Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 26 Sep 2022 20:33:24 +0200 Subject: [PATCH 109/164] Update src/transformers/models/time_series_transformer/configuration_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../configuration_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index dbd8ab1eea33b..86448946a5903 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -43,7 +43,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): context_length (`int`, *optional*): The context length for the encoder. If `None`, the context length will be the same as the `prediction_length`. - distribution_output (`string`, *optional* defaults to `student_t`): + distribution_output (`string`, *optional* defaults to `"student_t"`): The distribution emission head for the model. loss (`string`, *optional* defaults to `nll`): The loss function for the model corresponding to the `distribution_output` head. For parametric From 60a63bf4bfc6d5b95bda81d3cb550ef7037a3af7 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 26 Sep 2022 20:33:47 +0200 Subject: [PATCH 110/164] Update src/transformers/models/time_series_transformer/configuration_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../configuration_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 86448946a5903..07aa0c828a4d8 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -45,7 +45,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): `prediction_length`. distribution_output (`string`, *optional* defaults to `"student_t"`): The distribution emission head for the model. - loss (`string`, *optional* defaults to `nll`): + loss (`string`, *optional* defaults to `"nll"`): The loss function for the model corresponding to the `distribution_output` head. For parametric distributions it is negative log likelihood. 
input_size (`int`, *optional* defaults to 1): From e8663f09906b5836f97335d4e8536ee31a1c1676 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 26 Sep 2022 20:35:13 +0200 Subject: [PATCH 111/164] Update src/transformers/models/time_series_transformer/configuration_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../configuration_time_series_transformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 07aa0c828a4d8..3e5a42ac1bbe5 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -57,11 +57,11 @@ class TimeSeriesTransformerConfig(PretrainedConfig): 5, 6, 7]`. num_time_features (`int`, *optional* defaults to 0): The number of time features in the input time series. - num_feat_dynamic_real (`int`, *optional* defaults to `0`): + num_feat_dynamic_real (`int`, *optional* defaults to 0): The number of dynamic real valued features. - num_feat_static_cat (`int`, *optional* defaults to `0`): + num_feat_static_cat (`int`, *optional* defaults to 0): The number of static categorical features. - num_feat_static_real (`int`, *optional* defaults to `0`): + num_feat_static_real (`int`, *optional* defaults to 0): The number of static real valued features. cardinality (`list` of `int`, *optional*): The cardinality of the categorical features. Cannot be `None` if `num_feat_static_cat` is `> 0`. From 4b461217947dca8367e644550c3c7a5bcd434a06 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 26 Sep 2022 20:37:27 +0200 Subject: [PATCH 112/164] Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../time_series_transformer/modeling_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 99134ce9e998c..0753e34fff575 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -811,7 +811,7 @@ def _set_gradient_checkpointing(self, module, value=False): and behavior. Parameters: - config ([`~TimeSeriesTransformerConfig`]): + config ([`TimeSeriesTransformerConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
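Taken together, the configuration fields touched by the docstring fixes above can be exercised directly. The following is a minimal sketch under the assumption that the constructor accepts the keyword arguments documented in this version of the configuration class (argument names and defaults are taken from the docstring in this patch series, not verified against a release):

```python
from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerForPrediction

config = TimeSeriesTransformerConfig(
    prediction_length=24,
    context_length=48,               # falls back to `prediction_length` when left as None
    distribution_output="student_t",
    loss="nll",
    input_size=1,                    # univariate target
    lags_seq=[1, 2, 3, 4, 5, 6, 7],
    num_time_features=2,
    num_feat_dynamic_real=0,
    num_feat_static_real=0,
    num_feat_static_cat=1,
    cardinality=[366],               # required whenever num_feat_static_cat > 0
)

model = TimeSeriesTransformerForPrediction(config)
print(sum(p.numel() for p in model.parameters()))
```

With these settings the model expects `past_target`, `past_time_feat` and `past_observed_values` to cover `context_length + max(lags_seq)` time steps, which is exactly how `past_length` is computed in the usage example earlier in the series.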
From 3ab0bfd7cf6e5f5ede7c329e8a34820f6e19fda8 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 26 Sep 2022 20:37:52 +0200 Subject: [PATCH 113/164] Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../modeling_time_series_transformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 0753e34fff575..21a2156f50b2e 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1412,8 +1412,7 @@ def create_network_inputs( # sequence = torch.cat((prior_input, inputs), dim=1) lagged_sequence = self.get_lagged_subsequences( - sequence=inputs, - subsequences_length=subsequences_length, + sequence=inputs, subsequences_length=subsequences_length ) lags_shape = lagged_sequence.shape From ac3e8d85ae54b0083e829cb3167a9e111361c344 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 26 Sep 2022 21:55:41 +0200 Subject: [PATCH 114/164] Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../modeling_time_series_transformer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 21a2156f50b2e..c986f75f37d8d 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -276,10 +276,7 @@ def __init__(self, dim: int, keepdim: bool = False): self.keepdim = keepdim def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - scale = torch.ones_like(data).mean( - dim=self.dim, - keepdim=self.keepdim, - ) + scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim) return data, scale From 9d9ed9ac8050b310d63983858ce02e2410337c0d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 11:52:37 +0200 Subject: [PATCH 115/164] removed commented out tests --- .../test_modeling_time_series_transformer.py | 49 ++----------------- 1 file changed, 4 insertions(+), 45 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 5daf018302e5e..bad206f5c766c 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -130,39 +130,6 @@ def prepare_config_and_inputs_for_common(self): config, inputs_dict = self.prepare_config_and_inputs() return config, inputs_dict - # def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - # model = TimeSeriesTransformerModel(config=config).get_decoder().to(torch_device).eval() - # input_ids = inputs_dict["input_ids"] - # attention_mask = inputs_dict["attention_mask"] - - # # first forward pass - # outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - # 
output, past_key_values = outputs.to_tuple() - - # # create hypothetical multiple next token and extent to next_input_ids - # next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - # next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # # append to next input_ids and - # next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - # next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) - - # output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - # output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - # "last_hidden_state" - # ] - - # # select random slice - # random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - # output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - # output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - # self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # # test that outputs are equal for slice - # self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) - def check_encoder_decoder_model_standalone(self, config, inputs_dict): model = TimeSeriesTransformerModel(config=config).to(torch_device).eval() outputs = model(**inputs_dict) @@ -236,16 +203,6 @@ def test_encoder_decoder_model_standalone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - # def test_generate_fp16(self): - # config, input_dict = self.model_tester.prepare_config_and_inputs() - # input_ids = input_dict["input_ids"] - # attention_mask = input_ids.ne(1).to(torch_device) - # model = TimeSeriesTransformerForPrediction(config).eval().to(torch_device) - # if torch_device == "cuda": - # model.half() - # model.generate(input_ids, attention_mask=attention_mask) - # model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass @@ -404,12 +361,14 @@ def test_attention_outputs(self): @slow class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): def test_inference_no_head(self): - # model = TimeSeriesTransformerModel.from_pretrained("huggingface/tst-ett").to(torch_device) + model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( + torch_device + ) raise NotImplementedError("To do") def test_inference_head(self): - # model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/tst-ett").to(torch_device) + # model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(torch_device) raise NotImplementedError("To do") From d631896995d3832de821707f15c776ad4dc69534 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 12:06:06 +0200 Subject: [PATCH 116/164] added neg. 
bin and normal output --- .../modeling_time_series_transformer.py | 173 +++++++++++++++--- 1 file changed, 147 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index c986f75f37d8d..cb86e37976820 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -20,7 +20,14 @@ import torch from torch import nn -from torch.distributions import AffineTransform, Distribution, StudentT, TransformedDistribution +from torch.distributions import ( + AffineTransform, + Distribution, + NegativeBinomial, + Normal, + StudentT, + TransformedDistribution, +) from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput @@ -98,24 +105,11 @@ def forward(self, x, *args): return self.function(x, *args) -class Output: +class DistributionOutput: + distr_cls: type in_features: int args_dim: Dict[str, int] - def get_param_proj(self, in_features: int) -> nn.Module: - return ParameterProjection( - in_features=in_features, - args_dim=self.args_dim, - domain_map=LambdaLayer(self.domain_map), - ) - - def domain_map(self, *args: torch.Tensor): - raise NotImplementedError() - - -class DistributionOutput(Output): - distr_cls: type - def __init__(self) -> None: pass @@ -157,6 +151,16 @@ def value_in_support(self) -> float: """ return 0.0 + def get_param_proj(self, in_features: int) -> nn.Module: + r""" + Return the parameter projection layer that maps the input to the appropriate parameters of the distribution. + """ + return ParameterProjection( + in_features=in_features, + args_dim=self.args_dim, + domain_map=LambdaLayer(self.domain_map), + ) + def domain_map(self, *args: torch.Tensor): r""" Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the @@ -165,6 +169,14 @@ def domain_map(self, *args: torch.Tensor): """ raise NotImplementedError() + @classmethod + def squareplus(cls, x: torch.Tensor) -> torch.Tensor: + r""" + Helper to map inputs to the positive orthant by applying the square-plus operation. 
Reference: + https://twitter.com/jon_barron/status/1387167648669048833 + """ + return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0 + class StudentTOutput(DistributionOutput): args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1} @@ -172,8 +184,8 @@ class StudentTOutput(DistributionOutput): @classmethod def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor): - scale = nn.functional.softplus(scale) - df = 2.0 + nn.functional.softplus(df) + scale = cls.squareplus(scale) + df = 2.0 + cls.squareplus(df) return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) @property @@ -181,6 +193,54 @@ def event_shape(self) -> Tuple: return () +class NormalOutput(DistributionOutput): + args_dim: Dict[str, int] = {"loc": 1, "scale": 1} + distr_cls: type = Normal + + @classmethod + def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor): + scale = cls.squareplus(scale) + return loc.squeeze(-1), scale.squeeze(-1) + + @property + def event_shape(self) -> Tuple: + return () + + +class NegativeBinomialOutput(DistributionOutput): + args_dim: Dict[str, int] = {"total_count": 1, "logits": 1} + distr_cls: type = NegativeBinomial + + @classmethod + def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor): + total_count = cls.squareplus(total_count) + return total_count.squeeze(-1), logits.squeeze(-1) + + def _base_distribution(self, distr_args) -> Distribution: + total_count, logits = distr_args + return self.distr_cls(total_count=total_count, logits=logits) + + # Overwrites the parent class method. We cannot scale using the affine + # transformation since negative binomial should return integers. Instead + # we scale the parameters. + def distribution( + self, + distr_args, + loc: Optional[torch.Tensor] = None, + scale: Optional[torch.Tensor] = None, + ) -> Distribution: + total_count, logits = distr_args + + if scale is not None: + logits += scale.log() + + return NegativeBinomial(total_count=total_count, logits=logits) + + @property + def event_shape(self) -> Tuple: + return () + + class FeatureEmbedder(nn.Module): def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None: super().__init__() @@ -359,7 +419,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] @dataclass class Seq2SeqTSModelOutput(ModelOutput): """ - Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential decoding. Args: @@ -405,8 +465,11 @@ class Seq2SeqTSModelOutput(ModelOutput): Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - scale - static_features + scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*): + Scaling values of each time series' context window which is used to give the model inputs of the same + magnitude and then used to rescale to the original scale. + static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*): + Static features of each time series' in a batch which are copied to the covariates at inference time. """ last_hidden_state: torch.FloatTensor = None @@ -423,6 +486,59 @@ class Seq2SeqTSModelOutput(ModelOutput): @dataclass class Seq2SeqTSPredictionOutput(ModelOutput): + """ + Base class for model's predictions outputs that also contain the loss as well parameters of the chosen + distribution. 
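The square-plus helper and the negative-binomial scaling trick introduced above can be sanity-checked in isolation. The snippet below is an illustrative sketch that does not import the new classes; it only verifies that square-plus maps arbitrary real inputs to strictly positive values, and that adding `log(scale)` to the logits of a `NegativeBinomial` multiplies its mean by that scale, which is why `NegativeBinomialOutput.distribution` scales the parameters instead of wrapping the distribution in an affine transform.

```python
import torch
from torch.distributions import NegativeBinomial


def squareplus(x):
    # smooth map from the real line to the positive orthant, same formula as above
    return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0


x = torch.tensor([-5.0, -1.0, 0.0, 1.0, 5.0])
print(squareplus(x))  # every entry is strictly positive

# Scaling through the logits rather than an affine transform keeps the support on the integers:
total_count, logits, scale = torch.tensor(10.0), torch.tensor(0.5), torch.tensor(3.0)
base = NegativeBinomial(total_count=total_count, logits=logits)
scaled = NegativeBinomial(total_count=total_count, logits=logits + scale.log())
print(scaled.mean / base.mean)  # ~3.0, i.e. the mean is rescaled by `scale`
```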
+ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_target` is provided): + Distributional loss. + params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`): + Parameters of the chosen distribution. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*): + Scaling values of each time series' context window which is used to give the model inputs of the same + magnitude and then used to rescale to the original scale. 
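As a rough illustration of the `scale` values described in this output class, the sketch below computes a mean-absolute-value scale over the observed points of one context window, feeds the rescaled window to a stand-in for the network, and maps the result back to the original units. The real scaler is defined earlier in the modeling file; this is only a toy version of the same idea, with made-up numbers.

```python
import torch

context = torch.tensor([[10.0, 12.0, 0.0, 11.0]])  # one context window
observed = torch.tensor([[1.0, 1.0, 0.0, 1.0]])    # the third point was missing

# weighted mean of absolute values over the observed points
scale = (context.abs() * observed).sum(dim=1, keepdim=True) / observed.sum(
    dim=1, keepdim=True
).clamp(min=1.0)

model_inputs = context / scale                          # roughly unit magnitude
scaled_output = model_inputs.mean(dim=1, keepdim=True)  # stand-in for the network output
prediction = scaled_output * scale                      # rescaled back to the original units
print(scale, prediction)
```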
+ static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*): + Static features of each time series' in a batch which are copied to the covariates at inference time. + """ + loss: Optional[torch.FloatTensor] = None params: Optional[Tuple[torch.FloatTensor]] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None @@ -1408,9 +1524,7 @@ def create_network_inputs( features = torch.cat((expanded_static_feat, time_feat), dim=-1) # sequence = torch.cat((prior_input, inputs), dim=1) - lagged_sequence = self.get_lagged_subsequences( - sequence=inputs, subsequences_length=subsequences_length - ) + lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length) lags_shape = lagged_sequence.shape reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1) @@ -1569,8 +1683,15 @@ def __init__(self, config: TimeSeriesTransformerConfig): self.model = TimeSeriesTransformerModel(config) if config.distribution_output == "student_t": self.distribution_output = StudentTOutput() - self.param_proj = self.distribution_output.get_param_proj(self.model.config.d_model) - self.target_shape = self.distribution_output.event_shape + elif config.distribution_output == "normal": + self.distribution_output = NormalOutput() + elif config.distribution_output == "negative_binomial": + self.distribution_output = NegativeBinomialOutput() + else: + raise ValueError(f"Unknown distribution output {config.distribution_output}") + + self.param_proj = self.distribution_output.get_param_proj(self.model.config.d_model) + self.target_shape = self.distribution_output.event_shape if config.loss == "nll": self.loss = NegativeLogLikelihood() From 1618126d06fa5a42585195163c6c995178b3b2e3 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 16:48:17 +0200 Subject: [PATCH 117/164] Update src/transformers/models/time_series_transformer/configuration_time_series_transformer.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../configuration_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 3e5a42ac1bbe5..98c387c44acfb 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -52,7 +52,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): The size of the target variable which by default is 1 for univariate targets. scaling (`bool`, *optional* defaults to `True`): Whether to scale the input targets. - lags_seq (`list` of `int` *optional* defaults to `[1, 2, 3, 4, 5, 6, 7]`): + lags_seq (`list[int]`, *optional*;, defaults to `[1, 2, 3, 4, 5, 6, 7]`): The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4, 5, 6, 7]`. 
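A practical consequence of `lags_seq` that is easy to miss: the model needs `context_length + max(lags_seq)` past time steps so that every lagged copy of the target can be built. A hedged sketch, assuming the configuration class added in this pull request and made-up sizes:

```python
from transformers import TimeSeriesTransformerConfig

config = TimeSeriesTransformerConfig(
    prediction_length=24,
    context_length=48,
    lags_seq=[1, 2, 3, 4, 5, 6, 7],  # the default discussed above
    num_time_features=2,
)

# length that the `past_*` tensors handed to the model must have
past_length = config.context_length + max(config.lags_seq)
print(past_length)  # 55
```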
num_time_features (`int`, *optional* defaults to 0): From 70cfbce6fe9ef2a2887ebd8f1b02b227d6d9c3cf Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 16:49:24 +0200 Subject: [PATCH 118/164] move to one line --- .../test_modeling_time_series_transformer.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index bad206f5c766c..2b590e68040dc 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -166,12 +166,7 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict): @require_torch class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( - ( - TimeSeriesTransformerModel, - TimeSeriesTransformerForPrediction, - ) - if is_torch_available() - else () + (TimeSeriesTransformerModel, TimeSeriesTransformerForPrediction) if is_torch_available() else () ) all_generative_model_classes = (TimeSeriesTransformerForPrediction,) if is_torch_available() else () is_encoder_decoder = True From 678f2976d42130dd9373fbb735debd4eb3db342e Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 27 Sep 2022 15:20:43 +0000 Subject: [PATCH 119/164] Add docstrings --- .../en/model_doc/time_series_transformer.mdx | 7 + .../modeling_time_series_transformer.py | 127 +++++++++--------- .../test_modeling_time_series_transformer.py | 4 +- 3 files changed, 72 insertions(+), 66 deletions(-) diff --git a/docs/source/en/model_doc/time_series_transformer.mdx b/docs/source/en/model_doc/time_series_transformer.mdx index c6bb60823a16b..a734dd6f596ff 100644 --- a/docs/source/en/model_doc/time_series_transformer.mdx +++ b/docs/source/en/model_doc/time_series_transformer.mdx @@ -12,6 +12,13 @@ specific language governing permissions and limitations under the License. # Time Series Transformer + + +This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight +breaking changes to fix it in the future. If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). + + + ## Overview The Time Series Transformer model is a vanilla encoder-decoder Transformer for time series forecasting and classification. 
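To make the overview above a little more concrete, a randomly initialised model can already be built from a configuration. This is a minimal sketch that assumes the classes exported by this pull request and leaves every other configuration field at its default; the sizes are made up.

```python
from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerModel

config = TimeSeriesTransformerConfig(prediction_length=24, context_length=48)
model = TimeSeriesTransformerModel(config)

print(sum(p.numel() for p in model.parameters()))  # parameter count of the random model
```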
diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index cb86e37976820..4e33a842db053 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -38,12 +38,12 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "huggingface/tst-ett" +_CHECKPOINT_FOR_DOC = "huggingface/time-series-transformer-tourism-monthly" _CONFIG_FOR_DOC = "TimeSeriesTransformerConfig" TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "huggingface/tst-ett", + "huggingface/time-series-transformer-tourism-monthly", # See all TimeSeriesTransformer models at https://huggingface.co/models?filter=time_series_transformer ] @@ -932,31 +932,67 @@ def _set_gradient_checkpointing(self, module, value=False): TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r""" Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Provide for translation and summarization training. By default, the model will create this tensor by - shifting the `input_ids` to the right, following the paper. - decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also - be used by default. - - If you want to change padding behavior, you should read - [`modeling_time_series_transformer._prepare_decoder_attention_mask`] and modify to your needs. See diagram - 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. + past_target (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Past values of the time series, that serve as context in order to predict the future. These values may contain lags, + i.e. additional values from the past which are added in order to serve as "extra context". The `past_target` is what + the Transformer encoder gets as input (with optional additional features, such as `feat_static_cat`, `feat_static_real`, + `past_time_feat`). + + See the demo notebook and code snippets for details. + + Missing values need to be replaced with zeros. + + past_observed_values (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Boolean mask to indicate which `past_target` values were observed and which were missing. Mask values selected in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros i). 
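In practice the observed-values mask is derived directly from the raw series: missing points are NaN, the mask records where they were, and the NaNs are replaced by zeros before being passed to the model. A small illustration in plain PyTorch, with no model involved:

```python
import torch

raw = torch.tensor([[1.2, float("nan"), 0.9, 1.4]])

past_observed_values = (~torch.isnan(raw)).float()  # 1.0 where observed, 0.0 where missing
past_target = torch.nan_to_num(raw)                 # NaNs replaced by zeros

print(past_observed_values)  # tensor([[1., 0., 1., 1.]])
print(past_target)           # tensor([[1.2000, 0.0000, 0.9000, 1.4000]])
```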
+ + past_time_feat (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*): + Optional additional features, which the model internally will add to `past_target`. These could be things like "month of year", + "day of the month", etc. encoded as vectors (for instance as Fourier features). These could also be so-called "age" features, + which basically help the model know "at which point in life" a time-series is. Age features have small values for distant past + time steps and increase monotonically the more we approach the current time step. + + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where the position encodings + are learned from scratch internally as parameters of the model, the Time Series Transformer requires to provide additional features. + + The Time Series Transformer only learns additional embeddings for `feat_static_cat`. + + feat_static_cat (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*): + Optional static categorical features for which the model will learn an embedding vector, which it will add to the values + of the time series. + + Static categorical features are features which have the same value for all time steps (static over time). + + A typical example of a static categorical feature is a time series ID. + + feat_static_real (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*): + Optional static real features which the model will add to the values of the time series. + + Static real features are features which have the same value for all time steps (static over time). + + A typical example of a static real feature is promotion information. + + future_target (`torch.FloatTensor` of shape `(batch_size, prediction_length)`): + Future values of the time series, that serve as labels for the model. The `future_target` is what the Transformer + needs to learn to output, given the `past_target`. + + See the demo notebook and code snippets for details. + + Missing values need to be replaced with zeros. + + future_time_feat (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*): + Optional additional features, which the model internally will add to `future_target`. These could be things like "month of year", + "day of the month", etc. encoded as vectors (for instance as Fourier features). These could also be so-called "age" features, + which basically help the model know "at which point in life" a time-series is. Age features have small values for distant past + time steps and increase monotonically the more we approach the current time step. + + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where the position encodings + are learned from scratch internally as parameters of the model, the Time Series Transformer requires to provide additional features. + + The Time Series Transformer only learns additional embeddings for `feat_static_cat`. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: @@ -994,14 +1030,7 @@ def _set_gradient_checkpointing(self, module, value=False): you can choose to directly pass an embedded representation. 
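Because these time features stand in for the usual learned positional encodings, it can help to see what such a feature matrix might look like. The example below hand-rolls two features for monthly data, a normalised month-of-year signal and an age feature; real pipelines typically generate these with a helper library, so treat this purely as an illustration.

```python
import numpy as np

num_steps = 36  # e.g. past length + prediction length
month_of_year = np.arange(num_steps) % 12

time_features = np.stack(
    [
        month_of_year / 11.0 - 0.5,      # "month of year", scaled to [-0.5, 0.5]
        np.log1p(np.arange(num_steps)),  # "age", small in the distant past and growing
    ],
    axis=-1,
)
print(time_features.shape)  # (36, 2) -> num_time_features = 2
```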
This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded - representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be - input (see `past_key_values`). This is useful if you want more control over how to convert - `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - - If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value - of `inputs_embeds`. + use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). @@ -1016,34 +1045,6 @@ def _set_gradient_checkpointing(self, module, value=False): """ -TIME_SERIES_TRANSFORMER_STANDALONE_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`ProphetNetTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index bad206f5c766c..36f46c24786e2 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -14,13 +14,11 @@ # limitations under the License. """ Testing suite for the PyTorch TimeSeriesTransformer model. 
""" -import copy import inspect import tempfile import unittest -from transformers import MODEL_MAPPING, is_torch_available -from transformers.models.auto import get_values +from transformers import is_torch_available from transformers.testing_utils import require_torch, slow, torch_device from ...test_configuration_common import ConfigTester From 4d6bd2b1500993bf95e1be5ce058a38931b86d3a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 17:52:46 +0200 Subject: [PATCH 120/164] Update src/transformers/models/time_series_transformer/configuration_time_series_transformer.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../configuration_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 98c387c44acfb..c2e303d9335f5 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -52,7 +52,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): The size of the target variable which by default is 1 for univariate targets. scaling (`bool`, *optional* defaults to `True`): Whether to scale the input targets. - lags_seq (`list[int]`, *optional*;, defaults to `[1, 2, 3, 4, 5, 6, 7]`): + lags_seq (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`): The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4, 5, 6, 7]`. num_time_features (`int`, *optional* defaults to 0): From 717439192191a5cd4368566651b47cb8bf384e1c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 18:13:27 +0200 Subject: [PATCH 121/164] add try except for assert and raise --- .../modeling_time_series_transformer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index cb86e37976820..42b8a3e5807e2 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1456,10 +1456,14 @@ def get_lagged_subsequences( sequence_length = sequence.shape[1] indices = [lag - shift for lag in self.config.lags_seq] - assert max(indices) + subsequences_length <= sequence_length, ( - f"lags cannot go further than history length, found lag {max(indices)} " - f"while history length is only {sequence_length}" - ) + try: + assert max(indices) + subsequences_length <= sequence_length, ( + f"lags cannot go further than history length, found lag {max(indices)} " + f"while history length is only {sequence_length}" + ) + except AssertionError as e: + e.args += (max(indices), sequence_length) + raise lagged_values = [] for lag_index in indices: From 58d368bbc35c40cfdd52500d18452db3a68bc025 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 18:18:51 +0200 Subject: [PATCH 122/164] try and raise exception --- .../modeling_time_series_transformer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py 
b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 42b8a3e5807e2..e55b2e2777980 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1509,7 +1509,14 @@ def create_network_inputs( inputs_length = ( self._past_length + self.config.prediction_length if future_target is not None else self._past_length ) - assert inputs.shape[1] == inputs_length + try: + assert ( + inputs.shape[1] == inputs_length, + f"input length {inputs.shape[1]} and dynamic feature lengths {inputs_length} does not match", + ) + except AssertionError as e: + e.args += (inputs.shape[1], inputs_length) + raise subsequences_length = ( self.config.context_length + self.config.prediction_length From 319011bbbbbf05b79fe3103792699d8adfdd3cd0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 18:25:59 +0200 Subject: [PATCH 123/164] fix the documentation formatting --- .../modeling_time_series_transformer.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index e55b2e2777980..33630b1932b79 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1443,15 +1443,16 @@ def get_lagged_subsequences( self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0 ) -> torch.Tensor: """ - Returns lagged subsequences of a given sequence. Parameters ---------- sequence : Tensor - the sequence from which lagged subsequences should be extracted. Shape: (N, T, C). - subsequences_length : int - length of the subsequences to be extracted. - shift: int - shift the lags by this amount back. - Returns -------- lagged : Tensor - a tensor of shape (N, S, C, I), where S = subsequences_length and I = len(indices), containing lagged - subsequences. Specifically, lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :]. + Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I), + where S = subsequences_length and I = len(indices), containing lagged subsequences. Specifically, lagged[i, + j, :, k] = sequence[i, -indices[k]-S+j, :]. + Args: + sequence : Tensor + the sequence from which lagged subsequences should be extracted. Shape: (N, T, C). + subsequences_length : int + length of the subsequences to be extracted. + shift: int + shift the lags by this amount back. 
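The indexing convention in this docstring is easy to get wrong, so here is a tiny self-contained check of the stated identity `lagged[i, j, :, k] = sequence[i, -indices[k] - S + j, :]`. It builds the lagged tensor directly from slices rather than calling the method, and the lag values are made up.

```python
import torch

lags_seq, shift = [1, 2, 3], 0  # hypothetical lags
S = 4                           # subsequences_length
N, T, C = 1, 10, 1
sequence = torch.arange(N * T * C, dtype=torch.float).reshape(N, T, C)

indices = [lag - shift for lag in lags_seq]
lagged = torch.stack(
    [sequence[:, -idx - S : -idx if idx > 0 else None, :] for idx in indices],
    dim=-1,
)  # shape (N, S, C, len(indices))

for k, idx in enumerate(indices):
    for j in range(S):
        assert torch.equal(lagged[0, j, :, k], sequence[0, -idx - S + j, :])
print(lagged.shape)  # torch.Size([1, 4, 1, 3])
```

This also makes the guard above concrete: the history must be at least `max(indices) + subsequences_length` steps long for every slice to exist.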
""" sequence_length = sequence.shape[1] indices = [lag - shift for lag in self.config.lags_seq] From ecbf682e0a96569258207fd24ecff0dade8b1c14 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 18:27:24 +0200 Subject: [PATCH 124/164] fix assert call --- .../modeling_time_series_transformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 33630b1932b79..0f5e8126df9a6 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1511,8 +1511,7 @@ def create_network_inputs( self._past_length + self.config.prediction_length if future_target is not None else self._past_length ) try: - assert ( - inputs.shape[1] == inputs_length, + assert inputs.shape[1] == inputs_length, ( f"input length {inputs.shape[1]} and dynamic feature lengths {inputs_length} does not match", ) except AssertionError as e: From 74fd969c6b887d312c325f54ec7bdb2e4a5d5a09 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 18:30:03 +0200 Subject: [PATCH 125/164] fix docstring formatting --- .../modeling_time_series_transformer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 0f5e8126df9a6..ad09570cfdfdf 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1274,17 +1274,21 @@ def forward( If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of - all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of - shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing - `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more - control over how to convert `input_ids` indices into associated vectors than the model's internal - embedding lookup matrix. + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" From 21d2ba9c4481af9ec948c98bf9d6ebacd5e0d62d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Sep 2022 18:32:51 +0200 Subject: [PATCH 126/164] removed input_ids from DOCSTRING --- .../modeling_time_series_transformer.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index ad09570cfdfdf..ac5f2e0fa5fca 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -932,14 +932,6 @@ def _set_gradient_checkpointing(self, module, value=False): TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r""" Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`~TimeSeriesTransformerTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: @@ -1018,14 +1010,6 @@ def _set_gradient_checkpointing(self, module, value=False): TIME_SERIES_TRANSFORMER_STANDALONE_INPUTS_DOCSTRING = r""" Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`ProphetNetTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: From d995f4bded8c78a1f47eed4bfcf043a35ea5b510 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 27 Sep 2022 16:44:09 +0000 Subject: [PATCH 127/164] Update input docstring --- .../modeling_time_series_transformer.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 4e33a842db053..6565d7e01a8fc 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -932,58 +932,58 @@ def _set_gradient_checkpointing(self, module, value=False): TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r""" Args: - past_target (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Past values of the time series, that serve as context in order to predict the future. These values may contain lags, - i.e. additional values from the past which are added in order to serve as "extra context". The `past_target` is what - the Transformer encoder gets as input (with optional additional features, such as `feat_static_cat`, `feat_static_real`, - `past_time_feat`). + i.e. 
additional values from the past which are added in order to serve as "extra context". The `past_values` is what + the Transformer encoder gets as input (with optional additional features, such as `static_categorical_features`, `static_real_features`, + `past_time_featuresures`). See the demo notebook and code snippets for details. Missing values need to be replaced with zeros. - past_observed_values (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): - Boolean mask to indicate which `past_target` values were observed and which were missing. Mask values selected in `[0, 1]`: + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Boolean mask to indicate which `past_values` values were observed and which were missing. Mask values selected in `[0, 1]`: - 1 for values that are **observed**, - - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros i). + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - past_time_feat (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*): - Optional additional features, which the model internally will add to `past_target`. These could be things like "month of year", + past_time_featuresures (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*): + Optional time features, which the model internally will add to `past_values`. These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These could also be so-called "age" features, which basically help the model know "at which point in life" a time-series is. Age features have small values for distant past time steps and increase monotonically the more we approach the current time step. These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where the position encodings - are learned from scratch internally as parameters of the model, the Time Series Transformer requires to provide additional features. + are learned from scratch internally as parameters of the model, the Time Series Transformer requires to provide additional time features. - The Time Series Transformer only learns additional embeddings for `feat_static_cat`. + The Time Series Transformer only learns additional embeddings for `static_categorical_features`. - feat_static_cat (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*): - Optional static categorical features for which the model will learn an embedding vector, which it will add to the values + static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*): + Optional static categorical features for which the model will learn an embedding, which it will add to the values of the time series. Static categorical features are features which have the same value for all time steps (static over time). A typical example of a static categorical feature is a time series ID. - feat_static_real (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*): + static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*): Optional static real features which the model will add to the values of the time series. Static real features are features which have the same value for all time steps (static over time). 
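Both kinds of static features enter the model once per series rather than once per time step, and the categorical ones are passed through a learned embedding before being broadcast along the time axis. A minimal stand-in for that embedding step, using a plain `nn.Embedding` rather than the model's own `FeatureEmbedder`, with made-up sizes:

```python
import torch
from torch import nn

cardinality, embedding_dim = 366, 32  # e.g. 366 distinct time series IDs
embedder = nn.Embedding(cardinality, embedding_dim)

static_categorical_features = torch.tensor([[3], [17]])       # (batch, 1)
embedded = embedder(static_categorical_features.squeeze(-1))  # (batch, 32)

# broadcast along the time axis before concatenating with the per-step inputs
sequence_length = 48
expanded = embedded.unsqueeze(1).expand(-1, sequence_length, -1)
print(expanded.shape)  # torch.Size([2, 48, 32])
```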
A typical example of a static real feature is promotion information. - future_target (`torch.FloatTensor` of shape `(batch_size, prediction_length)`): - Future values of the time series, that serve as labels for the model. The `future_target` is what the Transformer - needs to learn to output, given the `past_target`. + future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`): + Future values of the time series, that serve as labels for the model. The `future_values` is what the Transformer + needs to learn to output, given the `past_values`. See the demo notebook and code snippets for details. Missing values need to be replaced with zeros. - future_time_feat (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*): - Optional additional features, which the model internally will add to `future_target`. These could be things like "month of year", + future_time_featuresures (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*): + Optional time features, which the model internally will add to `future_values`. These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These could also be so-called "age" features, which basically help the model know "at which point in life" a time-series is. Age features have small values for distant past time steps and increase monotonically the more we approach the current time step. @@ -991,7 +991,7 @@ def _set_gradient_checkpointing(self, module, value=False): These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where the position encodings are learned from scratch internally as parameters of the model, the Time Series Transformer requires to provide additional features. - The Time Series Transformer only learns additional embeddings for `feat_static_cat`. + The Time Series Transformer only learns additional embeddings for `static_categorical_features`. head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: From 9d8914f1a14825b4f9aedca33435f702422eee01 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 27 Sep 2022 17:16:10 +0000 Subject: [PATCH 128/164] Improve variable names --- .../configuration_time_series_transformer.py | 28 +-- .../modeling_time_series_transformer.py | 186 +++++++++--------- .../test_modeling_time_series_transformer.py | 50 ++--- 3 files changed, 132 insertions(+), 132 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 3e5a42ac1bbe5..25c5ac0849c32 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -57,16 +57,16 @@ class TimeSeriesTransformerConfig(PretrainedConfig): 5, 6, 7]`. num_time_features (`int`, *optional* defaults to 0): The number of time features in the input time series. - num_feat_dynamic_real (`int`, *optional* defaults to 0): + num_dynamic_real_features (`int`, *optional* defaults to 0): The number of dynamic real valued features. 
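These feature counts determine how wide the per-time-step vector fed to the Transformer is, following the `_number_of_features` sum shown a little further down in this configuration, plus one column per lag of the (univariate) target. A small arithmetic sketch with made-up values:

```python
# made-up configuration values
embedding_dimension = [32]        # one categorical feature embedded in 32 dimensions
num_dynamic_real_features = 0
num_time_features = 2
num_static_real_features = 1
input_size = 1                    # univariate target
lags_seq = [1, 2, 3, 4, 5, 6, 7]

number_of_features = (
    sum(embedding_dimension)
    + num_dynamic_real_features
    + num_time_features
    + max(1, num_static_real_features)  # at least one dummy static real feature
    + 1                                 # the log(scale)
)
per_step_width = input_size * len(lags_seq) + number_of_features
print(number_of_features, per_step_width)  # 36 43
```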
- num_feat_static_cat (`int`, *optional* defaults to 0): + num_static_categorical_features (`int`, *optional* defaults to 0): The number of static categorical features. - num_feat_static_real (`int`, *optional* defaults to 0): + num_static_real_features (`int`, *optional* defaults to 0): The number of static real valued features. cardinality (`list` of `int`, *optional*): - The cardinality of the categorical features. Cannot be `None` if `num_feat_static_cat` is `> 0`. + The cardinality of the categorical features. Cannot be `None` if `num_static_categorical_features` is `> 0`. embedding_dimension (`list` of `int`, *optional*): - The dimension of the embedding for the categorical features. Cannot be `None` if `num_feat_static_cat` is + The dimension of the embedding for the categorical features. Cannot be `None` if `num_static_categorical_features` is `> 0`. encoder_layers (`int`, *optional*, defaults to `2`): Number of encoder layers. @@ -130,9 +130,9 @@ def __init__( loss: str = "nll", lags_seq: List[int] = [1, 2, 3, 4, 5, 6, 7], scaling: bool = True, - num_feat_dynamic_real: int = 0, - num_feat_static_cat: int = 0, - num_feat_static_real: int = 0, + num_dynamic_real_features: int = 0, + num_static_categorical_features: int = 0, + num_static_real_features: int = 0, num_time_features: int = 0, cardinality: Optional[List[int]] = None, embedding_dimension: Optional[List[int]] = None, @@ -163,10 +163,10 @@ def __init__( self.num_time_features = num_time_features self.lags_seq = lags_seq self.scaling = scaling - self.num_feat_dynamic_real = num_feat_dynamic_real - self.num_feat_static_real = num_feat_static_real - self.num_feat_static_cat = num_feat_static_cat - self.cardinality = cardinality if cardinality and num_feat_static_cat > 0 else [1] + self.num_dynamic_real_features = num_dynamic_real_features + self.num_static_real_features = num_static_real_features + self.num_static_categorical_features = num_static_categorical_features + self.cardinality = cardinality if cardinality and num_static_categorical_features > 0 else [1] self.embedding_dimension = embedding_dimension or [min(50, (cat + 1) // 2) for cat in self.cardinality] self.num_parallel_samples = num_parallel_samples @@ -199,8 +199,8 @@ def __init__( def _number_of_features(self) -> int: return ( sum(self.embedding_dimension) - + self.num_feat_dynamic_real + + self.num_dynamic_real_features + self.num_time_features - + max(1, self.num_feat_static_real) # there is at least one dummy static real feature + + max(1, self.num_static_real_features) # there is at least one dummy static real feature + 1 # the log(scale) ) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 6565d7e01a8fc..91ad4a4fb7bdd 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -491,7 +491,7 @@ class Seq2SeqTSPredictionOutput(ModelOutput): distribution. Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_target` is provided): + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_values` is provided): Distributional loss. params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`): Parameters of the chosen distribution. 
@@ -895,7 +895,7 @@ def forward( class TimeSeriesTransformerPreTrainedModel(PreTrainedModel): config_class = TimeSeriesTransformerConfig base_model_prefix = "model" - main_input_name = "past_target" + main_input_name = "past_values" supports_gradient_checkpointing = True def _init_weights(self, module): @@ -936,7 +936,7 @@ def _set_gradient_checkpointing(self, module, value=False): Past values of the time series, that serve as context in order to predict the future. These values may contain lags, i.e. additional values from the past which are added in order to serve as "extra context". The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as `static_categorical_features`, `static_real_features`, - `past_time_featuresures`). + `past_time_featuresuresures`). See the demo notebook and code snippets for details. @@ -948,7 +948,7 @@ def _set_gradient_checkpointing(self, module, value=False): - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - past_time_featuresures (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*): + past_time_featuresuresures (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*): Optional time features, which the model internally will add to `past_values`. These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These could also be so-called "age" features, which basically help the model know "at which point in life" a time-series is. Age features have small values for distant past @@ -982,7 +982,7 @@ def _set_gradient_checkpointing(self, module, value=False): Missing values need to be replaced with zeros. - future_time_featuresures (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*): + future_time_featuresuresures (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*): Optional time features, which the model internally will add to `future_values`. These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These could also be so-called "age" features, which basically help the model know "at which point in life" a time-series is. Age features have small values for distant past @@ -1471,53 +1471,53 @@ def get_lagged_subsequences( def create_network_inputs( self, - feat_static_cat: torch.Tensor, - feat_static_real: torch.Tensor, - past_time_feat: torch.Tensor, - past_target: torch.Tensor, - past_observed_values: torch.Tensor, - future_time_feat: Optional[torch.Tensor] = None, - future_target: Optional[torch.Tensor] = None, + static_categorical_features: torch.Tensor, + static_real_features: torch.Tensor, + past_time_features: torch.Tensor, + past_values: torch.Tensor, + past_observed_mask: torch.Tensor, + future_time_features: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, ): # time feature time_feat = ( torch.cat( ( - past_time_feat[:, self._past_length - self.config.context_length :, ...], - future_time_feat, + past_time_features[:, self._past_length - self.config.context_length :, ...], + future_time_features, ), dim=1, ) - if future_target is not None - else past_time_feat[:, self._past_length - self.config.context_length :, ...] + if future_values is not None + else past_time_features[:, self._past_length - self.config.context_length :, ...] 
) # target - context = past_target[:, -self.config.context_length :] - observed_context = past_observed_values[:, -self.config.context_length :] + context = past_values[:, -self.config.context_length :] + observed_context = past_observed_mask[:, -self.config.context_length :] _, scale = self.scaler(context, observed_context) inputs = ( - torch.cat((past_target, future_target), dim=1) / scale - if future_target is not None - else past_target / scale + torch.cat((past_values, future_values), dim=1) / scale + if future_values is not None + else past_values / scale ) inputs_length = ( - self._past_length + self.config.prediction_length if future_target is not None else self._past_length + self._past_length + self.config.prediction_length if future_values is not None else self._past_length ) assert inputs.shape[1] == inputs_length subsequences_length = ( self.config.context_length + self.config.prediction_length - if future_target is not None + if future_values is not None else self.config.context_length ) # embeddings - embedded_cat = self.embedder(feat_static_cat) + embedded_cat = self.embedder(static_categorical_features) static_feat = torch.cat( - (embedded_cat, feat_static_real, scale.log()), + (embedded_cat, static_real_features, scale.log()), dim=1, ) expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1) @@ -1554,13 +1554,13 @@ def get_decoder(self): @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - past_target: torch.Tensor, - feat_static_cat: torch.Tensor, - feat_static_real: torch.Tensor, - past_time_feat: torch.Tensor, - past_observed_values: torch.Tensor, - future_time_feat: Optional[torch.Tensor] = None, - future_target: Optional[torch.Tensor] = None, + past_values: torch.Tensor, + static_categorical_features: torch.Tensor, + static_real_features: torch.Tensor, + past_time_features: torch.Tensor, + past_observed_mask: torch.Tensor, + future_time_features: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, @@ -1594,15 +1594,15 @@ def forward( >>> past_length = context_length + max(lags_seq) >>> # encoder inputs - >>> inputs["feat_static_cat"] = ids_tensor([batch_size, 1], cardinality) - >>> inputs["feat_static_real"] = torch.randn([batch_size, 1]) - >>> inputs["past_time_feat"] = torch.randn([batch_size, past_length, num_time_features]) - >>> inputs["past_target"] = torch.randn([batch_size, past_length]) - >>> inputs["past_observed_values"] = torch.randn([batch_size, past_length]) + >>> inputs["static_categorical_features"] = ids_tensor([batch_size, 1], cardinality) + >>> inputs["static_real_features"] = torch.randn([batch_size, 1]) + >>> inputs["past_time_features"] = torch.randn([batch_size, past_length, num_time_features]) + >>> inputs["past_values"] = torch.randn([batch_size, past_length]) + >>> inputs["past_observed_mask"] = torch.randn([batch_size, past_length]) >>> # decoder inputs - >>> inputs["future_time_feat"] = torch.randn([batch_size, prediction_length, num_time_features]) - >>> inputs["future_target"] = torch.randn([batch_size, prediction_length]) + >>> inputs["future_time_features"] = torch.randn([batch_size, prediction_length, num_time_features]) + >>> inputs["future_values"] = torch.randn([batch_size, prediction_length]) >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state @@ 
-1615,13 +1615,13 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_inputs, scale, static_feat = self.create_network_inputs( - feat_static_cat=feat_static_cat, - feat_static_real=feat_static_real, - past_time_feat=past_time_feat, - past_target=past_target, - past_observed_values=past_observed_values, - future_time_feat=future_time_feat, - future_target=future_target, + static_categorical_features=static_categorical_features, + static_real_features=static_real_features, + past_time_features=past_time_features, + past_values=past_values, + past_observed_mask=past_observed_mask, + future_time_features=future_time_features, + future_values=future_values, ) if encoder_outputs is None: @@ -1720,14 +1720,14 @@ def output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - past_target: torch.Tensor, - feat_static_cat: torch.Tensor, - feat_static_real: torch.Tensor, - past_time_feat: torch.Tensor, - past_observed_values: torch.Tensor, - future_time_feat: Optional[torch.Tensor] = None, - future_target: Optional[torch.Tensor] = None, - future_observed_values: Optional[torch.Tensor] = None, + past_values: torch.Tensor, + static_categorical_features: torch.Tensor, + static_real_features: torch.Tensor, + past_time_features: torch.Tensor, + past_observed_mask: torch.Tensor, + future_time_features: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, + future_observed_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -1755,32 +1755,32 @@ def forward( >>> past_length = context_length + max(lags_seq) >>> # encoder inputs - >>> inputs["feat_static_cat"] = ids_tensor([batch_size, 1], cardinality) - >>> inputs["feat_static_real"] = torch.randn([batch_size, 1]) - >>> inputs["past_time_feat"] = torch.randn([batch_size, past_length, num_time_features]) - >>> inputs["past_target"] = torch.randn([batch_size, past_length]) - >>> inputs["past_observed_values"] = torch.randn([batch_size, past_length]) + >>> inputs["static_categorical_features"] = ids_tensor([batch_size, 1], cardinality) + >>> inputs["static_real_features"] = torch.randn([batch_size, 1]) + >>> inputs["past_time_features"] = torch.randn([batch_size, past_length, num_time_features]) + >>> inputs["past_values"] = torch.randn([batch_size, past_length]) + >>> inputs["past_observed_mask"] = torch.randn([batch_size, past_length]) >>> # decoder inputs - >>> inputs["future_time_feat"] = torch.randn([batch_size, prediction_length, num_time_features]) - >>> inputs["future_target"] = torch.randn([batch_size, prediction_length]) + >>> inputs["future_time_features"] = torch.randn([batch_size, prediction_length, num_time_features]) + >>> inputs["future_values"] = torch.randn([batch_size, prediction_length]) >>> outputs = model(**inputs) >>> loss = outputs.loss ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if future_target is not None: + if future_values is not None: use_cache = False outputs = self.model( - past_target=past_target, - feat_static_cat=feat_static_cat, - feat_static_real=feat_static_real, - past_time_feat=past_time_feat, - past_observed_values=past_observed_values, - future_time_feat=future_time_feat, - future_target=future_target, + past_values=past_values, + 
static_categorical_features=static_categorical_features, + static_real_features=static_real_features, + past_time_features=past_time_features, + past_observed_mask=past_observed_mask, + future_time_features=future_time_features, + future_values=future_values, encoder_outputs=encoder_outputs, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -1790,19 +1790,19 @@ def forward( prediction_loss = None params = None - if future_target is not None: + if future_values is not None: params = self.output_params(outputs[0]) # outputs.last_hidden_state distr = self.output_distribution(params, outputs[-2]) # outputs.scale - loss = self.loss(distr, future_target) + loss = self.loss(distr, future_values) - if future_observed_values is None: - future_observed_values = torch.ones_like(future_target) + if future_observed_mask is None: + future_observed_mask = torch.ones_like(future_values) if len(self.target_shape) == 0: - loss_weights = future_observed_values + loss_weights = future_observed_mask else: - loss_weights = future_observed_values.min(dim=-1, keepdim=False) + loss_weights = future_observed_mask.min(dim=-1, keepdim=False) prediction_loss = weighted_average(loss, weights=loss_weights) @@ -1827,23 +1827,23 @@ def forward( @torch.no_grad() def generate( self, - feat_static_cat: torch.Tensor, - feat_static_real: torch.Tensor, - past_time_feat: torch.Tensor, - past_target: torch.Tensor, - past_observed_values: torch.Tensor, - future_time_feat: Optional[torch.Tensor], + static_categorical_features: torch.Tensor, + static_real_features: torch.Tensor, + past_time_features: torch.Tensor, + past_values: torch.Tensor, + past_observed_mask: torch.Tensor, + future_time_features: Optional[torch.Tensor], output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> torch.Tensor: outputs = self( - feat_static_cat=feat_static_cat, - feat_static_real=feat_static_real, - past_time_feat=past_time_feat, - past_target=past_target, - past_observed_values=past_observed_values, - future_time_feat=future_time_feat, - future_target=None, + static_categorical_features=static_categorical_features, + static_real_features=static_real_features, + past_time_features=past_time_features, + past_values=past_values, + past_observed_mask=past_observed_mask, + future_time_features=future_time_features, + future_values=None, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=True, @@ -1858,10 +1858,10 @@ def generate( num_parallel_samples = self.config.num_parallel_samples repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0) - repeated_past_target = past_target.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale + repeated_past_values = past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale - expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_feat.shape[1], -1) - features = torch.cat((expanded_static_feat, future_time_feat), dim=-1) + expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_features.shape[1], -1) + features = torch.cat((expanded_static_feat, future_time_features), dim=-1) repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0) repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0) @@ -1871,7 +1871,7 @@ def generate( # greedy decoding for k in range(self.config.prediction_length): lagged_sequence = self.model.get_lagged_subsequences( - 
sequence=repeated_past_target, + sequence=repeated_past_values, subsequences_length=1 + k, shift=1, ) @@ -1888,7 +1888,7 @@ def generate( distr = self.output_distribution(params, scale=repeated_scale) next_sample = distr.sample() - repeated_past_target = torch.cat((repeated_past_target, next_sample / repeated_scale), dim=1) + repeated_past_values = torch.cat((repeated_past_values, next_sample / repeated_scale), dim=1) future_samples.append(next_sample) concat_future_samples = torch.cat(future_samples, dim=1) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 36f46c24786e2..e23d9d3076587 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -84,16 +84,16 @@ def __init__( def prepare_config_and_inputs(self): _past_length = self.context_length + max(self.lags_seq) - feat_static_cat = ids_tensor([self.batch_size, 1], self.cardinality) - feat_static_real = floats_tensor([self.batch_size, 1]) + static_categorical_features = ids_tensor([self.batch_size, 1], self.cardinality) + static_real_features = floats_tensor([self.batch_size, 1]) - past_time_feat = floats_tensor([self.batch_size, _past_length, self.num_time_features]) - past_target = floats_tensor([self.batch_size, _past_length]) - past_observed_values = floats_tensor([self.batch_size, _past_length]) + past_time_features = floats_tensor([self.batch_size, _past_length, self.num_time_features]) + past_values = floats_tensor([self.batch_size, _past_length]) + past_observed_mask = floats_tensor([self.batch_size, _past_length]) # decoder inputs - future_time_feat = floats_tensor([self.batch_size, self.prediction_length, self.num_time_features]) - future_target = floats_tensor([self.batch_size, self.prediction_length]) + future_time_features = floats_tensor([self.batch_size, self.prediction_length, self.num_time_features]) + future_values = floats_tensor([self.batch_size, self.prediction_length]) config = TimeSeriesTransformerConfig( encoder_layers=self.num_hidden_layers, @@ -108,19 +108,19 @@ def prepare_config_and_inputs(self): context_length=self.context_length, lags_seq=self.lags_seq, num_time_features=self.num_time_features, - num_feat_static_cat=1, + num_static_categorical_features=1, cardinality=[self.cardinality], embedding_dimension=[self.embedding_dimension], ) inputs_dict = { - "past_target": past_target, - "feat_static_cat": feat_static_cat, - "feat_static_real": feat_static_real, - "past_time_feat": past_time_feat, - "future_time_feat": future_time_feat, - "past_observed_values": past_observed_values, - "future_target": future_target, + "past_values": past_values, + "static_categorical_features": static_categorical_features, + "static_real_features": static_real_features, + "past_time_features": past_time_features, + "past_observed_mask": past_observed_mask, + "future_time_features": future_time_features, + "future_values": future_values, } return config, inputs_dict @@ -205,7 +205,7 @@ def test_encoder_decoder_model_standalone(self): def test_resize_tokens_embeddings(self): pass - # # Input is 'feat_static_cat' not 'input_ids' + # # Input is 'static_categorical_features' not 'input_ids' def test_model_main_input_name(self): model_signature = inspect.signature(getattr(TimeSeriesTransformerModel, "forward")) # The main input is the name of the argument after `self` @@ -222,25 +222,25 @@ def 
test_forward_signature(self): arg_names = [*signature.parameters.keys()] expected_arg_names = [ - "past_target", - "feat_static_cat", - "feat_static_real", - "past_time_feat", - "past_observed_values", - "future_time_feat", - "future_target", + "past_values", + "static_categorical_features", + "static_real_features", + "past_time_features", + "past_observed_mask", + "future_time_features", + "future_values", ] expected_arg_names.extend( [ - "future_observed_values", + "future_observed_mask", "encoder_outputs", "use_cache", "output_attentions", "output_hidden_states", "return_dict", ] - if "future_observed_values" in arg_names + if "future_observed_mask" in arg_names else [ "attention_mask", "decoder_attention_mask", From bbf49a3542dc7fab7b9678254b6969fc337dfd7b Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 28 Sep 2022 08:24:35 +0000 Subject: [PATCH 129/164] Update order of inputs --- .../modeling_time_series_transformer.py | 38 +++++++++---------- .../test_modeling_time_series_transformer.py | 6 +-- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 91ad4a4fb7bdd..cfe9384c5b015 100755 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1471,13 +1471,13 @@ def get_lagged_subsequences( def create_network_inputs( self, - static_categorical_features: torch.Tensor, - static_real_features: torch.Tensor, - past_time_features: torch.Tensor, past_values: torch.Tensor, + past_time_features: torch.Tensor, past_observed_mask: torch.Tensor, - future_time_features: Optional[torch.Tensor] = None, + static_categorical_features: torch.Tensor, + static_real_features: torch.Tensor, future_values: Optional[torch.Tensor] = None, + future_time_features: Optional[torch.Tensor] = None, ): # time feature time_feat = ( @@ -1555,12 +1555,12 @@ def get_decoder(self): def forward( self, past_values: torch.Tensor, - static_categorical_features: torch.Tensor, - static_real_features: torch.Tensor, past_time_features: torch.Tensor, past_observed_mask: torch.Tensor, - future_time_features: Optional[torch.Tensor] = None, + static_categorical_features: torch.Tensor, + static_real_features: torch.Tensor, future_values: Optional[torch.Tensor] = None, + future_time_features: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, @@ -1615,13 +1615,13 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_inputs, scale, static_feat = self.create_network_inputs( - static_categorical_features=static_categorical_features, - static_real_features=static_real_features, - past_time_features=past_time_features, past_values=past_values, + past_time_features=past_time_features, past_observed_mask=past_observed_mask, - future_time_features=future_time_features, + static_categorical_features=static_categorical_features, + static_real_features=static_real_features, future_values=future_values, + future_time_features=future_time_features, ) if encoder_outputs is None: @@ -1721,12 +1721,12 @@ def output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist def forward( self, past_values: torch.Tensor, - 
static_categorical_features: torch.Tensor, - static_real_features: torch.Tensor, past_time_features: torch.Tensor, past_observed_mask: torch.Tensor, - future_time_features: Optional[torch.Tensor] = None, + static_categorical_features: torch.Tensor, + static_real_features: torch.Tensor, future_values: Optional[torch.Tensor] = None, + future_time_features: Optional[torch.Tensor] = None, future_observed_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, use_cache: Optional[bool] = None, @@ -1775,12 +1775,12 @@ def forward( outputs = self.model( past_values=past_values, - static_categorical_features=static_categorical_features, - static_real_features=static_real_features, past_time_features=past_time_features, past_observed_mask=past_observed_mask, - future_time_features=future_time_features, + static_categorical_features=static_categorical_features, + static_real_features=static_real_features, future_values=future_values, + future_time_features=future_time_features, encoder_outputs=encoder_outputs, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -1792,9 +1792,9 @@ def forward( params = None if future_values is not None: params = self.output_params(outputs[0]) # outputs.last_hidden_state - distr = self.output_distribution(params, outputs[-2]) # outputs.scale + distribution = self.output_distribution(params, outputs[-2]) # outputs.scale - loss = self.loss(distr, future_values) + loss = self.loss(distribution, future_values) if future_observed_mask is None: future_observed_mask = torch.ones_like(future_values) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index e23d9d3076587..b97b4cb472e2f 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -223,12 +223,12 @@ def test_forward_signature(self): expected_arg_names = [ "past_values", - "static_categorical_features", - "static_real_features", "past_time_features", "past_observed_mask", - "future_time_features", + "static_categorical_features", + "static_real_features", "future_values", + "future_time_features", ] expected_arg_names.extend( From f44aad502bb672ef5163e40fcf9abdb5cdcfd569 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 28 Sep 2022 08:57:42 +0000 Subject: [PATCH 130/164] Improve configuration --- .../configuration_time_series_transformer.py | 93 +++++++++++-------- .../modeling_time_series_transformer.py | 15 +-- .../test_modeling_time_series_transformer.py | 8 +- 3 files changed, 65 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 22204b67cc7c9..58c89ba4ee6e5 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Time Series Transformer model configuration""" + from typing import List, Optional from ...configuration_utils import PretrainedConfig @@ -22,7 +23,7 @@ logger = logging.get_logger(__name__) TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "huggingface/tst-ett": "https://huggingface.co/huggingface/tst-ett/resolve/main/config.json", + "huggingface/time-series-transformer-tourism-monthly": "https://huggingface.co/huggingface/time-series-transformer-tourism-monthly/resolve/main/config.json", # See all TimeSeriesTransformer models at https://huggingface.co/models?filter=time_series_transformer } @@ -32,70 +33,72 @@ class TimeSeriesTransformerConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`TimeSeriesTransformerModel`]. It is used to instantiate a Time Series Transformer model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series - Transformer [huggingface/tst-ett](https://huggingface.co/huggingface/tst-ett) architecture. + Transformer [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly) + architecture. Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: prediction_length (`int`): - The prediction horizon for the model. - context_length (`int`, *optional*): + The prediction length for the decoder. In other words, the prediction horizion of the model. + context_length (`int`, *optional*, defaults to `prediction_length`): The context length for the encoder. If `None`, the context length will be the same as the `prediction_length`. - distribution_output (`string`, *optional* defaults to `"student_t"`): - The distribution emission head for the model. - loss (`string`, *optional* defaults to `"nll"`): + distribution_output (`string`, *optional*, defaults to `"student_t"`): + The distribution emission head for the model. Could be either "student_t", "normal" or "negative_binomial". + loss (`string`, *optional*, defaults to `"nll"`): The loss function for the model corresponding to the `distribution_output` head. For parametric - distributions it is negative log likelihood. - input_size (`int`, *optional* defaults to 1): - The size of the target variable which by default is 1 for univariate targets. + distributions it is the negative log likelihood (nll) - which currently is the only supported one. + input_size (`int`, *optional*, defaults to 1): + The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivarate targets. scaling (`bool`, *optional* defaults to `True`): Whether to scale the input targets. - lags_seq (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`): + lags_sequence (`list[int]`, *optional*, defaults to [1, 2, 3, 4, 5, 6, 7]): The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4, 5, 6, 7]`. - num_time_features (`int`, *optional* defaults to 0): + num_time_features (`int`, *optional*, defaults to 0): The number of time features in the input time series. - num_dynamic_real_features (`int`, *optional* defaults to 0): + num_dynamic_real_features (`int`, *optional*, defaults to 0): The number of dynamic real valued features. 
- num_static_categorical_features (`int`, *optional* defaults to 0): + num_static_categorical_features (`int`, *optional*, defaults to 0): The number of static categorical features. - num_static_real_features (`int`, *optional* defaults to 0): + num_static_real_features (`int`, *optional*, defaults to 0): The number of static real valued features. - cardinality (`list` of `int`, *optional*): - The cardinality of the categorical features. Cannot be `None` if `num_static_categorical_features` is `> 0`. - embedding_dimension (`list` of `int`, *optional*): - The dimension of the embedding for the categorical features. Cannot be `None` if `num_static_categorical_features` is - `> 0`. - encoder_layers (`int`, *optional*, defaults to `2`): + cardinality (`list[int]`, *optional*): + The cardinality (number of different values) for each of the static categorical features. Should be a list of integers, having the same + length as `num_static_categorical_features`. Cannot be `None` if `num_static_categorical_features` is > 0. + embedding_dimension (`list[int]`, *optional*): + The dimension of the embedding for each of the static categorical features. Should be a list of integers, having the same + length as `num_static_categorical_features`. Cannot be `None` if `num_static_categorical_features` is > 0. + encoder_layers (`int`, *optional*, defaults to 2): Number of encoder layers. - decoder_layers (`int`, *optional*, defaults to `2`): + decoder_layers (`int`, *optional*, defaults to 2): Number of decoder layers. - encoder_attention_heads (`int`, *optional*, defaults to `2`): + encoder_attention_heads (`int`, *optional*, defaults to 2): Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (`int`, *optional*, defaults to `2`): + decoder_attention_heads (`int`, *optional*, defaults to 2): Number of attention heads for each attention layer in the Transformer decoder. - encoder_ffn_dim (`int`, *optional*, defaults to `32`): + encoder_ffn_dim (`int`, *optional*, defaults to 32): Dimension of the "intermediate" (often named feed-forward) layer in encoder. - decoder_ffn_dim (`int`, *optional*, defaults to `32`): + decoder_ffn_dim (`int`, *optional*, defaults to 32): Dimension of the "intermediate" (often named feed-forward) layer in decoder. activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and `"relu"` are supported. - dropout (`float`, *optional*, defaults to `0.1`): + dropout (`float`, *optional*, defaults to 0.1): The dropout probability for all fully connected layers in the encoder, and decoder. - encoder_layerdrop (`float`, *optional*, defaults to `0.1`): + encoder_layerdrop (`float`, *optional*, defaults to 0.1): The dropout probability for the attention and fully connected layers for each encoder layer. - decoder_layerdrop (`float`, *optional*, defaults to `0.1`): + decoder_layerdrop (`float`, *optional*, defaults to 0.1): The dropout probability for the attention and fully connected layers for each decoder layer. - attention_dropout (`float`, *optional*, defaults to `0.1`): + attention_dropout (`float`, *optional*, defaults to 0.1): The dropout probability for the attention probabilities. - activation_dropout (`float`, *optional*, defaults to `0.1`): + activation_dropout (`float`, *optional*, defaults to 0.1): The dropout probability used between the two layers of the feed-forward networks. 
- num_parallel_samples (`int`, *optional*, defaults to `100`): + num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples to generate in parallel for each time step of inference. - init_std (`float`, *optional*, defaults to `0.02`): + init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. use_cache (`bool`, *optional*, defaults to `True`): Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. @@ -105,10 +108,10 @@ class TimeSeriesTransformerConfig(PretrainedConfig): ```python >>> from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerModel - >>> # Initializing a Time Series Transformer huggingface/tst-ett style configuration + >>> # Initializing a default Time Series Transformer configuration >>> configuration = TimeSeriesTransformerConfig() - >>> # Initializing a model from the huggingface/tst-ett style configuration + >>> # Randomly initializing a model from the configuration >>> model = TimeSeriesTransformerModel(configuration) >>> # Accessing the model configuration @@ -128,7 +131,7 @@ def __init__( context_length: Optional[int] = None, distribution_output: str = "student_t", loss: str = "nll", - lags_seq: List[int] = [1, 2, 3, 4, 5, 6, 7], + lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7], scaling: bool = True, num_dynamic_real_features: int = 0, num_static_categorical_features: int = 0, @@ -161,17 +164,27 @@ def __init__( self.loss = loss self.input_size = input_size self.num_time_features = num_time_features - self.lags_seq = lags_seq + self.lags_sequence = lags_sequence self.scaling = scaling self.num_dynamic_real_features = num_dynamic_real_features self.num_static_real_features = num_static_real_features self.num_static_categorical_features = num_static_categorical_features - self.cardinality = cardinality if cardinality and num_static_categorical_features > 0 else [1] - self.embedding_dimension = embedding_dimension or [min(50, (cat + 1) // 2) for cat in self.cardinality] + if cardinality and num_static_categorical_features > 0: + if len(cardinality) != num_static_categorical_features: + raise ValueError("The cardinality should be a list having the same length as `num_static_categorical_features`") + self.cardinality = cardinality + else: + self.cardinality = [1] + if embedding_dimension and num_static_categorical_features > 0: + if len(embedding_dimension) != num_static_categorical_features: + raise ValueError("The embedding dimension should be a list having the same length as `num_static_categorical_features`") + self.embedding_dimension = embedding_dimension + else: + self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality] self.num_parallel_samples = num_parallel_samples # Transformer architecture configuration - self.d_model = input_size * len(lags_seq) + self._number_of_features + self.d_model = input_size * len(lags_sequence) + self._number_of_features self.encoder_attention_heads = encoder_attention_heads self.decoder_attention_heads = decoder_attention_heads self.encoder_ffn_dim = encoder_ffn_dim diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 1e1c4e8810100..cdf6368326dba 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ 
b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -38,7 +38,6 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "huggingface/time-series-transformer-tourism-monthly" _CONFIG_FOR_DOC = "TimeSeriesTransformerConfig" @@ -1459,7 +1458,7 @@ def __init__(self, config: TimeSeriesTransformerConfig): @property def _past_length(self) -> int: - return self.config.context_length + max(self.config.lags_seq) + return self.config.context_length + max(self.config.lags_sequence) def get_lagged_subsequences( self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0 @@ -1477,7 +1476,7 @@ def get_lagged_subsequences( shift the lags by this amount back. """ sequence_length = sequence.shape[1] - indices = [lag - shift for lag in self.config.lags_seq] + indices = [lag - shift for lag in self.config.lags_sequence] try: assert max(indices) + subsequences_length <= sequence_length, ( @@ -1622,8 +1621,8 @@ def forward( >>> num_time_features = 10 >>> content_length = 8 >>> prediction_length = 2 - >>> lags_seq = [2, 3] - >>> past_length = context_length + max(lags_seq) + >>> lags_sequence = [2, 3] + >>> past_length = context_length + max(lags_sequence) >>> # encoder inputs >>> inputs["static_categorical_features"] = ids_tensor([batch_size, 1], cardinality) @@ -1728,6 +1727,8 @@ def __init__(self, config: TimeSeriesTransformerConfig): if config.loss == "nll": self.loss = NegativeLogLikelihood() + else: + raise ValueError(f"Unknown loss function {config.loss}") # Initialize weights of distribution_output and apply final processing self.post_init() @@ -1783,8 +1784,8 @@ def forward( >>> num_time_features = 10 >>> content_length = 8 >>> prediction_length = 2 - >>> lags_seq = [2, 3] - >>> past_length = context_length + max(lags_seq) + >>> lags_sequence = [2, 3] + >>> past_length = context_length + max(lags_sequence) >>> # encoder inputs >>> inputs["static_categorical_features"] = ids_tensor([batch_size, 1], cardinality) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 9c581de3d7974..2504bcf43d2fe 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -58,7 +58,7 @@ def __init__( hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - lags_seq=[1, 2, 3, 4, 5], + lags_sequence=[1, 2, 3, 4, 5], ): self.parent = parent self.batch_size = batch_size @@ -66,7 +66,7 @@ def __init__( self.context_length = context_length self.cardinality = cardinality self.num_time_features = num_time_features - self.lags_seq = lags_seq + self.lags_sequence = lags_sequence self.embedding_dimension = embedding_dimension self.is_training = is_training self.hidden_size = hidden_size @@ -82,7 +82,7 @@ def __init__( self.decoder_seq_length = prediction_length def prepare_config_and_inputs(self): - _past_length = self.context_length + max(self.lags_seq) + _past_length = self.context_length + max(self.lags_sequence) static_categorical_features = ids_tensor([self.batch_size, 1], self.cardinality) static_real_features = floats_tensor([self.batch_size, 1]) @@ -106,7 +106,7 @@ def prepare_config_and_inputs(self): attention_dropout=self.attention_probs_dropout_prob, prediction_length=self.prediction_length, context_length=self.context_length, - lags_seq=self.lags_seq, + lags_sequence=self.lags_sequence, 
num_time_features=self.num_time_features, num_static_categorical_features=1, cardinality=[self.cardinality], From 4e1d1f6751a840ee3367558fe632e380259236f5 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 28 Sep 2022 09:04:50 +0000 Subject: [PATCH 131/164] Improve variable names --- .../modeling_time_series_transformer.py | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index cdf6368326dba..52b3d1b544614 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -150,7 +150,7 @@ def value_in_support(self) -> float: """ return 0.0 - def get_param_proj(self, in_features: int) -> nn.Module: + def get_parameter_projection(self, in_features: int) -> nn.Module: r""" Return the parameter projection layer that maps the input to the appropriate parameters of the distribution. """ @@ -416,7 +416,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] @dataclass -class Seq2SeqTSModelOutput(ModelOutput): +class Seq2SeqTimeSeriesModelOutput(ModelOutput): """ Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential decoding. @@ -484,7 +484,7 @@ class Seq2SeqTSModelOutput(ModelOutput): @dataclass -class Seq2SeqTSPredictionOutput(ModelOutput): +class Seq2SeqTimeSeriesPredictionOutput(ModelOutput): """ Base class for model's predictions outputs that also contain the loss as well parameters of the chosen distribution. @@ -552,7 +552,7 @@ class Seq2SeqTSPredictionOutput(ModelOutput): @dataclass -class SampleTSPredictionOutput(ModelOutput): +class SampleTimeSeriesPredictionOutput(ModelOutput): sequences: torch.FloatTensor = None @@ -1432,7 +1432,7 @@ def custom_forward(*inputs): @add_start_docstrings( - "The bare TimeSeriesTransformer Model outputting raw hidden-states without any specific head on top.", + "The bare Time Series Transformer Model outputting raw hidden-states without any specific head on top.", TIME_SERIES_TRANSFORMER_START_DOCSTRING, ) class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel): @@ -1449,7 +1449,7 @@ def __init__(self, config: TimeSeriesTransformerConfig): embedding_dims=config.embedding_dimension, ) - # transformer enc-decoder and mask initializer + # transformer encoder-decoder and mask initializer self.encoder = TimeSeriesTransformerEncoder(config) self.decoder = TimeSeriesTransformerDecoder(config) @@ -1467,13 +1467,14 @@ def get_lagged_subsequences( Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I), where S = subsequences_length and I = len(indices), containing lagged subsequences. Specifically, lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :]. + Args: - sequence : Tensor - the sequence from which lagged subsequences should be extracted. Shape: (N, T, C). + sequence: Tensor + The sequence from which lagged subsequences should be extracted. Shape: (N, T, C). subsequences_length : int - length of the subsequences to be extracted. + Length of the subsequences to be extracted. shift: int - shift the lags by this amount back. + Shift the lags by this amount back. 
""" sequence_length = sequence.shape[1] indices = [lag - shift for lag in self.config.lags_sequence] @@ -1582,7 +1583,7 @@ def get_decoder(self): return self.decoder @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, past_values: torch.Tensor, @@ -1691,7 +1692,7 @@ def forward( if not return_dict: return decoder_outputs + encoder_outputs + (scale, static_feat) - return Seq2SeqTSModelOutput( + return Seq2SeqTimeSeriesModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, past_key_values=decoder_outputs.past_key_values, decoder_hidden_states=decoder_outputs.hidden_states, @@ -1722,7 +1723,7 @@ def __init__(self, config: TimeSeriesTransformerConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.param_proj = self.distribution_output.get_param_proj(self.model.config.d_model) + self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model) self.target_shape = self.distribution_output.event_shape if config.loss == "nll": @@ -1734,7 +1735,7 @@ def __init__(self, config: TimeSeriesTransformerConfig): self.post_init() def output_params(self, dec_output): - return self.param_proj(dec_output) + return self.parameter_projection(dec_output) def get_encoder(self): return self.model.get_encoder() @@ -1750,7 +1751,7 @@ def output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist return self.distribution_output.distribution(sliced_params, scale=scale) @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, past_values: torch.Tensor, @@ -1843,7 +1844,7 @@ def forward( outputs = ((params,) + outputs[1:]) if params is not None else outputs[1:] return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs - return Seq2SeqTSPredictionOutput( + return Seq2SeqTimeSeriesPredictionOutput( loss=prediction_loss, params=params, past_key_values=outputs.past_key_values, @@ -1917,7 +1918,7 @@ def generate( dec_output = decoder(inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden) dec_last_hidden = dec_output.last_hidden_state - params = self.param_proj(dec_last_hidden[:, -1:]) + params = self.parameter_projection(dec_last_hidden[:, -1:]) distr = self.output_distribution(params, scale=repeated_scale) next_sample = distr.sample() @@ -1926,7 +1927,7 @@ def generate( concat_future_samples = torch.cat(future_samples, dim=1) - return SampleTSPredictionOutput( + return SampleTimeSeriesPredictionOutput( sequences=concat_future_samples.reshape( (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape, ) From 6ceed7c2bedcced25fb9dc62573b61978f2a450b Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 28 Sep 2022 09:31:50 +0000 Subject: [PATCH 132/164] Improve docs --- .../configuration_time_series_transformer.py | 29 +++++-- .../modeling_time_series_transformer.py | 85 ++++++++++--------- .../test_modeling_time_series_transformer.py | 4 +- 3 files changed, 68 insertions(+), 50 deletions(-) diff --git 
a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 58c89ba4ee6e5..36b960439eff4 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -23,7 +23,9 @@ logger = logging.get_logger(__name__) TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "huggingface/time-series-transformer-tourism-monthly": "https://huggingface.co/huggingface/time-series-transformer-tourism-monthly/resolve/main/config.json", + "huggingface/time-series-transformer-tourism-monthly": ( + "https://huggingface.co/huggingface/time-series-transformer-tourism-monthly/resolve/main/config.json" + ), # See all TimeSeriesTransformer models at https://huggingface.co/models?filter=time_series_transformer } @@ -33,7 +35,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`TimeSeriesTransformerModel`]. It is used to instantiate a Time Series Transformer model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series - Transformer [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly) + Transformer + [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly) architecture. Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the @@ -51,7 +54,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig): The loss function for the model corresponding to the `distribution_output` head. For parametric distributions it is the negative log likelihood (nll) - which currently is the only supported one. input_size (`int`, *optional*, defaults to 1): - The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivarate targets. + The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of + multivarate targets. scaling (`bool`, *optional* defaults to `True`): Whether to scale the input targets. lags_sequence (`list[int]`, *optional*, defaults to [1, 2, 3, 4, 5, 6, 7]): @@ -66,11 +70,13 @@ class TimeSeriesTransformerConfig(PretrainedConfig): num_static_real_features (`int`, *optional*, defaults to 0): The number of static real valued features. cardinality (`list[int]`, *optional*): - The cardinality (number of different values) for each of the static categorical features. Should be a list of integers, having the same - length as `num_static_categorical_features`. Cannot be `None` if `num_static_categorical_features` is > 0. + The cardinality (number of different values) for each of the static categorical features. Should be a list + of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if + `num_static_categorical_features` is > 0. embedding_dimension (`list[int]`, *optional*): - The dimension of the embedding for each of the static categorical features. Should be a list of integers, having the same - length as `num_static_categorical_features`. Cannot be `None` if `num_static_categorical_features` is > 0. 
+ The dimension of the embedding for each of the static categorical features. Should be a list of integers, + having the same length as `num_static_categorical_features`. Cannot be `None` if + `num_static_categorical_features` is > 0. encoder_layers (`int`, *optional*, defaults to 2): Number of encoder layers. decoder_layers (`int`, *optional*, defaults to 2): Number of decoder layers. @@ -171,13 +177,18 @@ def __init__( self.num_static_categorical_features = num_static_categorical_features if cardinality and num_static_categorical_features > 0: if len(cardinality) != num_static_categorical_features: - raise ValueError("The cardinality should be a list having the same length as `num_static_categorical_features`") + raise ValueError( + "The cardinality should be a list having the same length as `num_static_categorical_features`" + ) self.cardinality = cardinality else: self.cardinality = [1] if embedding_dimension and num_static_categorical_features > 0: if len(embedding_dimension) != num_static_categorical_features: - raise ValueError("The embedding dimension should be a list having the same length as `num_static_categorical_features`") + raise ValueError( + "The embedding dimension should be a list having the same length as" + " `num_static_categorical_features`" + ) self.embedding_dimension = embedding_dimension else: self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality] diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 52b3d1b544614..069b1078c2c58 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -932,35 +932,38 @@ def _set_gradient_checkpointing(self, module, value=False): TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r""" Args: past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): - Past values of the time series, that serve as context in order to predict the future. These values may contain lags, - i.e. additional values from the past which are added in order to serve as "extra context". The `past_values` is what - the Transformer encoder gets as input (with optional additional features, such as `static_categorical_features`, `static_real_features`, - `past_time_featuresuresures`). + Past values of the time series, that serve as context in order to predict the future. These values may + contain lags, i.e. additional values from the past which are added in order to serve as "extra context". + The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as + `static_categorical_features`, `static_real_features`, `past_time_features`). - See the demo notebook and code snippets for details. + The sequence length here is equal to `context_length` + `max(config.lags_sequence)`. Missing values need to be replaced with zeros. - past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): - Boolean mask to indicate which `past_values` values were observed and which were missing. Mask values selected in `[0, 1]`: - - - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
- past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*): - Optional time features, which the model internally will add to `past_values`. These could be things like "month of year", - "day of the month", etc. encoded as vectors (for instance as Fourier features). These could also be so-called "age" features, - which basically help the model know "at which point in life" a time-series is. Age features have small values for distant past - time steps and increase monotonically the more we approach the current time step. + Optional time features, which the model internally will add to `past_values`. These could be things like + "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These + could also be so-called "age" features, which basically help the model know "at which point in life" a + time-series is. Age features have small values for distant past time steps and increase monotonically the + more we approach the current time step. - These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where the position encodings - are learned from scratch internally as parameters of the model, the Time Series Transformer requires to provide additional time features. + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where + the position encodings are learned from scratch internally as parameters of the model, the Time Series + Transformer requires to provide additional time features. The Time Series Transformer only learns additional embeddings for `static_categorical_features`. + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in + `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*): - Optional static categorical features for which the model will learn an embedding, which it will add to the values - of the time series. + Optional static categorical features for which the model will learn an embedding, which it will add to the + values of the time series. Static categorical features are features which have the same value for all time steps (static over time). @@ -974,41 +977,38 @@ def _set_gradient_checkpointing(self, module, value=False): A typical example of a static real feature is promotion information. future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`): - Future values of the time series, that serve as labels for the model. The `future_values` is what the Transformer - needs to learn to output, given the `past_values`. + Future values of the time series, that serve as labels for the model. The `future_values` is what the + Transformer needs to learn to output, given the `past_values`. See the demo notebook and code snippets for details. Missing values need to be replaced with zeros. future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*): - Optional time features, which the model internally will add to `future_values`. These could be things like "month of year", - "day of the month", etc. encoded as vectors (for instance as Fourier features). 
These could also be so-called "age" features, - which basically help the model know "at which point in life" a time-series is. Age features have small values for distant past - time steps and increase monotonically the more we approach the current time step. + Optional time features, which the model internally will add to `future_values`. These could be things like + "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These + could also be so-called "age" features, which basically help the model know "at which point in life" a + time-series is. Age features have small values for distant past time steps and increase monotonically the + more we approach the current time step. - These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where the position encodings - are learned from scratch internally as parameters of the model, the Time Series Transformer requires to provide additional features. + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where + the position encodings are learned from scratch internally as parameters of the model, the Time Series + Transformer requires to provide additional features. The Time Series Transformer only learns additional embeddings for `static_categorical_features`. - + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Provide for translation and summarization training. By default, the model will create this tensor by - shifting the `input_ids` to the right, following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also - be used by default. + Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to + make sure the model can only look at previous inputs in order to predict the future. - If you want to change padding behavior, you should read - [`modeling_time_series_transformer._prepare_decoder_attention_mask`] and modify to your needs. See diagram - 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: @@ -1467,7 +1467,7 @@ def get_lagged_subsequences( Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I), where S = subsequences_length and I = len(indices), containing lagged subsequences. Specifically, lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :]. - + Args: sequence: Tensor The sequence from which lagged subsequences should be extracted. Shape: (N, T, C). 
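The `get_lagged_subsequences` docstring above pins down the indexing convention lagged[i, j, :, k] = sequence[i, -indices[k] - S + j, :]. As a quick illustration of what that convention means in practice, here is a standalone sketch (not the library method itself; the helper name `lagged_subsequences` is made up for this example) that reproduces it with plain tensor slicing and checks the identity on random data:

```python
import torch


def lagged_subsequences(sequence: torch.Tensor, indices: list, subsequences_length: int) -> torch.Tensor:
    # sequence has shape (N, T, C); the result has shape (N, S, C, I) with S = subsequences_length, I = len(indices)
    sequence_length = sequence.shape[1]
    if max(indices) + subsequences_length > sequence_length:
        raise ValueError("lags cannot reach further back than the start of the sequence")
    lagged = []
    for lag in indices:
        begin = -lag - subsequences_length
        end = -lag if lag > 0 else None
        lagged.append(sequence[:, begin:end, ...])
    return torch.stack(lagged, dim=-1)


sequence = torch.randn(2, 20, 1)  # (N, T, C)
indices, subsequences_length = [1, 2, 3], 5
lagged = lagged_subsequences(sequence, indices, subsequences_length)
assert lagged.shape == (2, 5, 1, 3)
# spot-check the identity lagged[i, j, :, k] == sequence[i, -indices[k] - S + j, :]
assert torch.equal(lagged[0, 2, :, 1], sequence[0, -indices[1] - subsequences_length + 2, :])
```

Stacking the lags along an extra dimension like this is also why the configuration computes `d_model` as `input_size * len(lags_sequence) + self._number_of_features`: the lagged values end up flattened into the feature dimension of the Transformer input.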
@@ -1771,6 +1771,15 @@ def forward( r""" Returns: + future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + This mask is used to filter out missing values for the final loss calculation. + Examples: ```python diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 2504bcf43d2fe..ad39cd323aa72 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -354,9 +354,7 @@ def test_attention_outputs(self): @slow class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): def test_inference_no_head(self): - model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( - torch_device - ) + # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(torch_device) raise NotImplementedError("To do") From b7ce7661dc17eaea48dbb0315c00c1ffaa0cb0e0 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 28 Sep 2022 09:35:58 +0000 Subject: [PATCH 133/164] Remove key_length from tests --- .../test_modeling_time_series_transformer.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index ad39cd323aa72..82651f9768851 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -78,7 +78,6 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.encoder_seq_length = context_length - self.key_length = context_length self.decoder_seq_length = prediction_length def prepare_config_and_inputs(self): @@ -260,8 +259,6 @@ def test_attention_outputs(self): seq_len = getattr(self.model_tester, "seq_length", None) decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True @@ -288,7 +285,7 @@ def test_attention_outputs(self): self.assertListEqual( list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length], ) out_len = len(outputs) @@ -314,7 +311,7 @@ def test_attention_outputs(self): self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_seq_length], ) # cross attentions @@ -326,7 +323,7 @@ def test_attention_outputs(self): [ 
self.model_tester.num_attention_heads, decoder_seq_length, - encoder_key_length, + encoder_seq_length, ], ) @@ -346,7 +343,7 @@ def test_attention_outputs(self): self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length], ) From bac12b9ad9b64ac9485aeaa26114f41f72356431 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Wed, 28 Sep 2022 10:13:04 +0000 Subject: [PATCH 134/164] Add extra docs --- .../en/model_doc/time_series_transformer.mdx | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/docs/source/en/model_doc/time_series_transformer.mdx b/docs/source/en/model_doc/time_series_transformer.mdx index a734dd6f596ff..5dedef02eaa80 100644 --- a/docs/source/en/model_doc/time_series_transformer.mdx +++ b/docs/source/en/model_doc/time_series_transformer.mdx @@ -21,13 +21,37 @@ breaking changes to fix it in the future. If you see something strange, file a [ ## Overview -The Time Series Transformer model is a vanilla encoder-decoder Transformer for time series forecasting and classification. +The Time Series Transformer model is a vanilla encoder-decoder Transformer for time series forecasting. Tips: -- The model is trained using "teacher-forcing", similar to machine translation. This means that, during training, one provides the ground truth -previous targets to the model rather than the model's predictions in order to predict the next target. Only at inference time, we sample from the model -to make a prediction at each time step, which is then fed to the model in order to make the next prediction (also called autoregressive generation). +- Similar to other models in the library, [`TimeSeriesTransformerModel`] is the raw Transformer without any head on top, and [`TimeSeriesTransformerForPrediction`] +adds a distribution head on top of the former, which can be used for time-series forecasting. Note that this is a so-called probabilistic forecasting model, not a +point forecasting model. This means that the model learns a distribution, from which one can sample. The model doesn't directly output values. +- [`TimeSeriesTransformerForPrediction`] consists of 2 blocks: an encoder, which takes a `context_length` of time series values as input (called `past_values`), +and a decoder, which predicts a `prediction_length` of time series values into the future (called `future_values`). During training, one needs to provide +pairs of (`past_values` and `future_values`) to the model. +- In addition to the raw (`past_values` and `future_values`), one typically provides additional features to the model. These can be the following: + - `past_time_features`: temporal features which the model will add to `past_values`. These serve as "positional encodings" for the Transformer encoder. + Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). + e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). + - `future_time_features`: temporal features which the model will add to `future_values`. These serve as "positional encodings" for the Transformer decoder. + Examples are "day of the month", "month of the year", etc. 
as scalar values (and then stacked together as a vector). + e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). + - `static_categorical_features`: categorical features which are static over time (i.e., have the same value for all `past_values` and `future_values`). + An example here is the store ID or region ID that identifies a given time-series. + Note that these features need to be known for ALL data points (also those in the future). + - `static_real_features`: real-valued features which are static over time (i.e., have the same value for all `past_values` and `future_values`). + An example here is the image representation of the product for which you have the time-series values (like the [ResNet](resnet) embedding of a "shoe" picture, + if your time-series is about the sales of shoes). + Note that these features need to be known for ALL data points (also those in the future). +- The model is trained using "teacher-forcing", similar to how a Transformer is trained for machine translation. This means that, during training, one shifts the +`future_values` one position to the right as input to the decoder, prepended by the last value of `past_values`. At each time step, the model needs to predict the +next target. So the set-up of training is similar to a GPT model for language, except that there's no notion of `decoder_start_token_id` (we just use the last value +of the context as initial input for the decoder). +- At inference time, we give the final value of the `past_values` as input to the decoder. Next, we can sample from the model to make a prediction at the next time step, +which is then fed to the decoder in order to make the next prediction (also called autoregressive generation). 
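Putting these tips together, the snippet below is a condensed usage sketch rather than official documentation: it relies on the `huggingface/time-series-transformer-tourism-monthly` checkpoint and the `kashif/tourism-monthly-batch` dataset batch that the integration tests later in this series use, and the keyword argument names follow the model as defined in this PR (they could still change in later commits).

```python
import torch
from huggingface_hub import hf_hub_download
from transformers import TimeSeriesTransformerForPrediction

file = hf_hub_download(repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset")
batch = torch.load(file)

model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly")

# training-style forward pass: teacher forcing with a known future window, returning a negative log-likelihood loss
outputs = model(
    past_values=batch["past_values"],
    past_time_features=batch["past_time_features"],
    past_observed_mask=batch["past_observed_mask"],
    static_categorical_features=batch["static_categorical_features"],
    static_real_features=batch["static_real_features"],
    future_values=batch["future_values"],
    future_time_features=batch["future_time_features"],
)
outputs.loss.backward()

# inference: autoregressively sample trajectories for an unknown future window
with torch.no_grad():
    prediction = model.generate(
        past_values=batch["past_values"],
        past_time_features=batch["past_time_features"],
        past_observed_mask=batch["past_observed_mask"],
        static_categorical_features=batch["static_categorical_features"],
        static_real_features=batch["static_real_features"],
        future_time_features=batch["future_time_features"],
    )

# reduce the sampled trajectories to a point forecast, e.g. by averaging over the sample dimension
point_forecast = prediction.sequences.mean(dim=1)
```

Because the head is probabilistic, the sampled `sequences` can also be used to form prediction intervals rather than a single point forecast.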
+ This model was contributed by [kashif]( Date: Wed, 28 Sep 2022 14:04:45 +0200 Subject: [PATCH 135/164] initial unittests --- .../test_modeling_time_series_transformer.py | 62 +++++++++++++------ 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 82651f9768851..e2c0104fa8d8f 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -24,6 +24,7 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +TOLERANCE = 1e-4 if is_torch_available(): import torch @@ -80,21 +81,8 @@ def __init__( self.encoder_seq_length = context_length self.decoder_seq_length = prediction_length - def prepare_config_and_inputs(self): - _past_length = self.context_length + max(self.lags_sequence) - - static_categorical_features = ids_tensor([self.batch_size, 1], self.cardinality) - static_real_features = floats_tensor([self.batch_size, 1]) - - past_time_features = floats_tensor([self.batch_size, _past_length, self.num_time_features]) - past_values = floats_tensor([self.batch_size, _past_length]) - past_observed_mask = floats_tensor([self.batch_size, _past_length]) - - # decoder inputs - future_time_features = floats_tensor([self.batch_size, self.prediction_length, self.num_time_features]) - future_values = floats_tensor([self.batch_size, self.prediction_length]) - - config = TimeSeriesTransformerConfig( + def get_config(self): + return TimeSeriesTransformerConfig( encoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers, encoder_attention_heads=self.num_attention_heads, @@ -112,6 +100,20 @@ def prepare_config_and_inputs(self): embedding_dimension=[self.embedding_dimension], ) + def prepare_time_series_transformer_inputs_dict(self, config): + _past_length = config.context_length + max(config.lags_sequence) + + static_categorical_features = ids_tensor([self.batch_size, 1], config.cardinality[0]) + static_real_features = floats_tensor([self.batch_size, 1]) + + past_time_features = floats_tensor([self.batch_size, _past_length, config.num_time_features]) + past_values = floats_tensor([self.batch_size, _past_length]) + past_observed_mask = floats_tensor([self.batch_size, _past_length]) + + # decoder inputs + future_time_features = floats_tensor([self.batch_size, config.prediction_length, config.num_time_features]) + future_values = floats_tensor([self.batch_size, config.prediction_length]) + inputs_dict = { "past_values": past_values, "static_categorical_features": static_categorical_features, @@ -121,6 +123,11 @@ def prepare_config_and_inputs(self): "future_time_features": future_time_features, "future_values": future_values, } + return inputs_dict + + def prepare_config_and_inputs(self): + config = self.get_config() + inputs_dict = self.prepare_time_series_transformer_inputs_dict(config) return config, inputs_dict def prepare_config_and_inputs_for_common(self): @@ -351,14 +358,31 @@ def test_attention_outputs(self): @slow class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): def test_inference_no_head(self): - # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(torch_device) + model = 
TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( + torch_device + ) + inputs_dict = prepare_time_series_transformer_inputs_dict(model.config) + with torch.no_grad(): + output = model(**inputs_dict) - raise NotImplementedError("To do") + expected_shape = torch.Size((1, 11, 1024)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) def test_inference_head(self): - # model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(torch_device) - + model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( + torch_device + ) + raise NotImplementedError("To do") def test_seq_to_seq_generation(self): + model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( + torch_device + ) + raise NotImplementedError("Generation not implemented yet") From 4f858f37dcbd42fabed83ed326540cb47769c26b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 28 Sep 2022 19:34:07 +0200 Subject: [PATCH 136/164] added test_inference_no_head test --- .../test_modeling_time_series_transformer.py | 50 ++++++++++++------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index e2c0104fa8d8f..656e2b903e3fd 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -18,6 +18,7 @@ import tempfile import unittest +from huggingface_hub import hf_hub_download from transformers import is_torch_available from transformers.testing_utils import require_torch, slow, torch_device @@ -354,6 +355,12 @@ def test_attention_outputs(self): ) +def prepare_batch(filename="train-batch.pt"): + file = hf_hub_download(repo_id="kashif/tourism-monthly-batch", filename=filename, repo_type="dataset") + batch = torch.load(file, map_location=torch_device) + return batch + + @require_torch @slow class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): @@ -361,28 +368,37 @@ def test_inference_no_head(self): model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( torch_device ) - inputs_dict = prepare_time_series_transformer_inputs_dict(model.config) - with torch.no_grad(): - output = model(**inputs_dict) + batch = prepare_batch() - expected_shape = torch.Size((1, 11, 1024)) + with torch.no_grad(): + output = model( + past_values=batch["past_values"], + past_time_features=batch["past_time_features"], + past_observed_mask=batch["past_observed_mask"], + static_categorical_features=batch["static_categorical_features"], + static_real_features=batch["static_real_features"], + future_values=batch["future_values"], + future_time_features=batch["future_time_features"], + )[0] + + expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model)) self.assertEqual(output.shape, expected_shape) # change to expected output here expected_slice = torch.tensor( - [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, 
-2.1845]], device=torch_device + [[-0.3125, -1.2884, -1.1118], [-0.5801, -1.4907, -0.7782], [0.0849, -1.6557, -0.9755]], device=torch_device ) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) - def test_inference_head(self): - model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( - torch_device - ) - - raise NotImplementedError("To do") + # def test_inference_head(self): + # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( + # torch_device + # ) - def test_seq_to_seq_generation(self): - model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( - torch_device - ) + # raise NotImplementedError("To do") + + # def test_seq_to_seq_generation(self): + # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( + # torch_device + # ) - raise NotImplementedError("Generation not implemented yet") + # raise NotImplementedError("Generation not implemented yet") From 3d05dadae69ef98e1b6644292d7f9e3a00942df2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 28 Sep 2022 19:50:25 +0200 Subject: [PATCH 137/164] added test_inference_head --- .../test_modeling_time_series_transformer.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 656e2b903e3fd..6dd106ac8630c 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -389,12 +389,27 @@ def test_inference_no_head(self): ) self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) - # def test_inference_head(self): - # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( - # torch_device - # ) - - # raise NotImplementedError("To do") + def test_inference_head(self): + model = TimeSeriesTransformerForPrediction.from_pretrained( + "huggingface/time-series-transformer-tourism-monthly" + ).to(torch_device) + batch = prepare_batch("val-batch.pt") + with torch.no_grad(): + output = model( + past_values=batch["past_values"], + past_time_features=batch["past_time_features"], + past_observed_mask=batch["past_observed_mask"], + static_categorical_features=batch["static_categorical_features"], + static_real_features=batch["static_real_features"], + future_time_features=batch["future_time_features"], + )[1] + expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[0.9127, -0.2056, -0.5259], [1.0572, 1.4104, -0.1964], [0.1358, 2.0348, 0.5739]], device=torch_device + ) + self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) # def test_seq_to_seq_generation(self): # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( From 93ff6599133d85b801136e76fcc379104ce3f7ae Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 28 Sep 2022 19:56:30 +0200 Subject: [PATCH 138/164] add test_seq_to_seq_generation 
--- .../test_modeling_time_series_transformer.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 6dd106ac8630c..c574d6bacad78 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -411,9 +411,25 @@ def test_inference_head(self): ) self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) - # def test_seq_to_seq_generation(self): - # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to( - # torch_device - # ) - - # raise NotImplementedError("Generation not implemented yet") + def test_seq_to_seq_generation(self): + model = TimeSeriesTransformerForPrediction.from_pretrained( + "huggingface/time-series-transformer-tourism-monthly" + ).to(torch_device) + batch = prepare_batch("val-batch.pt") + with torch.no_grad(): + outputs = model.generate( + static_categorical_features=batch["static_categorical_features"], + static_real_features=batch["static_real_features"], + past_time_features=batch["past_time_features"], + past_values=batch["past_values"], + future_time_features=batch["future_time_features"], + past_observed_mask=batch["past_observed_mask"], + ) + expected_shape = torch.Size( + ( + 64, + model.config.num_parallel_samples, + model.config.prediction_length, + ) + ) + self.assertEqual(outputs.sequences.shape, expected_shape) From 2339a53304c9fb2350b4bce3f3f195a7cf81014c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 28 Sep 2022 19:58:47 +0200 Subject: [PATCH 139/164] make style --- .../test_modeling_time_series_transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index c574d6bacad78..f10434b35e5bd 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -25,6 +25,7 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + TOLERANCE = 1e-4 if is_torch_available(): From 4c05158bcf1bff298dd6ff9b4ea11c5cfadabf29 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 28 Sep 2022 20:00:35 +0200 Subject: [PATCH 140/164] one line --- .../test_modeling_time_series_transformer.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index f10434b35e5bd..615a8d0cbd9ad 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -426,11 +426,5 @@ def test_seq_to_seq_generation(self): future_time_features=batch["future_time_features"], past_observed_mask=batch["past_observed_mask"], ) - expected_shape = torch.Size( - ( - 64, - model.config.num_parallel_samples, - model.config.prediction_length, - ) - ) + expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) 
self.assertEqual(outputs.sequences.shape, expected_shape) From 0df5859a6cb617a9b7ecb75c42da5d3a6b7d673e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 28 Sep 2022 20:19:48 +0200 Subject: [PATCH 141/164] assert mean prediction --- .../test_modeling_time_series_transformer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 615a8d0cbd9ad..f3ea6bae39dc4 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -428,3 +428,7 @@ def test_seq_to_seq_generation(self): ) expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) self.assertEqual(outputs.sequences.shape, expected_shape) + + expected_slice = torch.tensor([2289.5203, 2778.3054, 4648.1313], device=torch_device) + mean_prediction = outputs.sequences.mean(dim=1) + self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) From d31c0de291066b97cf18258d05f864f2f275e8cb Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 28 Sep 2022 20:20:44 +0200 Subject: [PATCH 142/164] removed comments --- .../test_modeling_time_series_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index f3ea6bae39dc4..c3ab42833aa4a 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -384,7 +384,7 @@ def test_inference_no_head(self): expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model)) self.assertEqual(output.shape, expected_shape) - # change to expected output here + expected_slice = torch.tensor( [[-0.3125, -1.2884, -1.1118], [-0.5801, -1.4907, -0.7782], [0.0849, -1.6557, -0.9755]], device=torch_device ) @@ -406,7 +406,7 @@ def test_inference_head(self): )[1] expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model)) self.assertEqual(output.shape, expected_shape) - # change to expected output here + expected_slice = torch.tensor( [[0.9127, -0.2056, -0.5259], [1.0572, 1.4104, -0.1964], [0.1358, 2.0348, 0.5739]], device=torch_device ) From 0827fbe691ece51bc8ab5bf972b098fe01c84ac2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 29 Sep 2022 09:41:23 +0200 Subject: [PATCH 143/164] Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../modeling_time_series_transformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 069b1078c2c58..c9e3e6ce01b05 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1593,7 +1593,6 @@ def forward( static_real_features: torch.Tensor, future_values: Optional[torch.Tensor] = None, future_time_features: Optional[torch.Tensor] = None, - attention_mask: 
Optional[torch.Tensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, decoder_head_mask: Optional[torch.Tensor] = None, @@ -1601,8 +1600,8 @@ def forward( encoder_outputs: Optional[List[torch.FloatTensor]] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, output_hidden_states: Optional[bool] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, + use_cache: Optional[bool] = None, return_dict: Optional[bool] = None, ): r""" From c8b56cc439630f6b38f268bd969f34c2d65aa455 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 29 Sep 2022 09:41:46 +0200 Subject: [PATCH 144/164] Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../modeling_time_series_transformer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index c9e3e6ce01b05..fc332a21f1a89 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1761,10 +1761,15 @@ def forward( future_values: Optional[torch.Tensor] = None, future_time_features: Optional[torch.Tensor] = None, future_observed_mask: Optional[torch.Tensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + use_cache: Optional[bool] = None, return_dict: Optional[bool] = None, ): r""" From a2043c198b41d78d39ffc0236ed621dbadf889b4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 29 Sep 2022 10:15:05 +0200 Subject: [PATCH 145/164] fix order of args --- .../modeling_time_series_transformer.py | 11 +++++++---- .../test_modeling_time_series_transformer.py | 12 ++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index fc332a21f1a89..bcea07b4f734c 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1659,7 +1659,6 @@ def forward( enc_input = transformer_inputs[:, : self.config.context_length, ...] 
encoder_outputs = self.encoder( inputs_embeds=enc_input, - attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -1678,7 +1677,6 @@ def forward( inputs_embeds=dec_input, attention_mask=decoder_attention_mask, encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=attention_mask, head_mask=decoder_head_mask, cross_attn_head_mask=cross_attn_head_mask, past_key_values=past_key_values, @@ -1828,11 +1826,16 @@ def forward( static_real_features=static_real_features, future_values=future_values, future_time_features=future_time_features, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, encoder_outputs=encoder_outputs, - output_attentions=output_attentions, + past_key_values=past_key_values, output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=output_attentions, use_cache=use_cache, + return_dict=return_dict, ) prediction_loss = None diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index c3ab42833aa4a..d513f1fe21252 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -237,15 +237,19 @@ def test_forward_signature(self): expected_arg_names.extend( [ "future_observed_mask", + "decoder_attention_mask", + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", "encoder_outputs", - "use_cache", - "output_attentions", + "past_key_values", "output_hidden_states", + "output_attentions", + "use_cache", "return_dict", ] if "future_observed_mask" in arg_names else [ - "attention_mask", "decoder_attention_mask", "head_mask", "decoder_head_mask", @@ -253,8 +257,8 @@ def test_forward_signature(self): "encoder_outputs", "past_key_values", "output_hidden_states", - "use_cache", "output_attentions", + "use_cache", "return_dict", ] ) From a895801cd85cb53df6f2cc4de27432e58dcdf8e9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 09:53:09 +0200 Subject: [PATCH 146/164] make past_observed_mask optional as well --- .../modeling_time_series_transformer.py | 15 +++++++++------ .../test_modeling_time_series_transformer.py | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index bcea07b4f734c..b6b7444f0bf39 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1499,9 +1499,9 @@ def create_network_inputs( self, past_values: torch.Tensor, past_time_features: torch.Tensor, - past_observed_mask: torch.Tensor, static_categorical_features: torch.Tensor, static_real_features: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, future_time_features: Optional[torch.Tensor] = None, ): @@ -1519,6 +1519,9 @@ def create_network_inputs( ) # target + if past_observed_mask is None: + past_observed_mask = torch.ones_like(past_values) + context = past_values[:, -self.config.context_length :] observed_context = past_observed_mask[:, -self.config.context_length :] 
_, scale = self.scaler(context, observed_context) @@ -1588,11 +1591,11 @@ def forward( self, past_values: torch.Tensor, past_time_features: torch.Tensor, - past_observed_mask: torch.Tensor, static_categorical_features: torch.Tensor, static_real_features: torch.Tensor, future_values: Optional[torch.Tensor] = None, future_time_features: Optional[torch.Tensor] = None, + past_observed_mask: Optional[torch.Tensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, decoder_head_mask: Optional[torch.Tensor] = None, @@ -1629,7 +1632,7 @@ def forward( >>> inputs["static_real_features"] = torch.randn([batch_size, 1]) >>> inputs["past_time_features"] = torch.randn([batch_size, past_length, num_time_features]) >>> inputs["past_values"] = torch.randn([batch_size, past_length]) - >>> inputs["past_observed_mask"] = torch.randn([batch_size, past_length]) + >>> inputs["past_observed_mask"] = torch.ones([batch_size, past_length]) >>> # decoder inputs >>> inputs["future_time_features"] = torch.randn([batch_size, prediction_length, num_time_features]) @@ -1753,11 +1756,11 @@ def forward( self, past_values: torch.Tensor, past_time_features: torch.Tensor, - past_observed_mask: torch.Tensor, static_categorical_features: torch.Tensor, static_real_features: torch.Tensor, future_values: Optional[torch.Tensor] = None, future_time_features: Optional[torch.Tensor] = None, + past_observed_mask: Optional[torch.Tensor] = None, future_observed_mask: Optional[torch.Tensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, @@ -1804,7 +1807,7 @@ def forward( >>> inputs["static_real_features"] = torch.randn([batch_size, 1]) >>> inputs["past_time_features"] = torch.randn([batch_size, past_length, num_time_features]) >>> inputs["past_values"] = torch.randn([batch_size, past_length]) - >>> inputs["past_observed_mask"] = torch.randn([batch_size, past_length]) + >>> inputs["past_observed_mask"] = torch.ones([batch_size, past_length]) >>> # decoder inputs >>> inputs["future_time_features"] = torch.randn([batch_size, prediction_length, num_time_features]) @@ -1881,8 +1884,8 @@ def generate( static_real_features: torch.Tensor, past_time_features: torch.Tensor, past_values: torch.Tensor, - past_observed_mask: torch.Tensor, future_time_features: Optional[torch.Tensor], + past_observed_mask: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> torch.Tensor: diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index d513f1fe21252..6c6d15daa0ce1 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -227,11 +227,11 @@ def test_forward_signature(self): expected_arg_names = [ "past_values", "past_time_features", - "past_observed_mask", "static_categorical_features", "static_real_features", "future_values", "future_time_features", + "past_observed_mask", ] expected_arg_names.extend( From 6840145a817b64ba92b032f394ba24bf1c4a5522 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 10:01:12 +0200 Subject: [PATCH 147/164] added Amazon license header --- .../time_series_transformer/modeling_time_series_transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index b6b7444f0bf39..928e3fb304a3b 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1,5 +1,6 @@ # coding=utf-8 # Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 2ec916eb86a87a863e09a170cc55028b589d3d82 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 12:26:50 +0200 Subject: [PATCH 148/164] updated utils with new fieldnames --- .../time-series-prediction/utils_ts.py | 180 +++++++++--------- 1 file changed, 94 insertions(+), 86 deletions(-) diff --git a/examples/pytorch/time-series-prediction/utils_ts.py b/examples/pytorch/time-series-prediction/utils_ts.py index e1dfc1cfd19c9..0636e170346eb 100644 --- a/examples/pytorch/time-series-prediction/utils_ts.py +++ b/examples/pytorch/time-series-prediction/utils_ts.py @@ -17,6 +17,7 @@ from functools import lru_cache from typing import Iterable, Optional +import numpy as np import pandas as pd from torch.utils.data import DataLoader @@ -39,6 +40,7 @@ Transformation, ValidationSplitSampler, VstackFeatures, + RenameFields, ) from gluonts.transform.sampler import InstanceSampler @@ -48,44 +50,53 @@ def _as_period(val, freq): return pd.Period(val, freq) -def transform_start_field(batch, freq): +def transform_data(batch, freq, log1p_transform=False): batch[FieldName.START] = [_as_period(entry, freq) for entry in batch[FieldName.START]] + if log1p_transform: + batch[FieldName.TARGET] = np.log1p(batch[FieldName.TARGET]) return batch -def create_transformation(freq, config) -> Transformation: +def create_transformation(freq: str, config: PretrainedConfig) -> Transformation: remove_field_names = [] - if config.num_feat_static_real == 0: + if config.num_static_real_features == 0: remove_field_names.append(FieldName.FEAT_STATIC_REAL) - if config.num_feat_dynamic_real == 0: + if config.num_dynamic_real_features == 0: remove_field_names.append(FieldName.FEAT_DYNAMIC_REAL) + # a bit like torchvision.transforms.Compose return Chain( + # step 1: remove static/dynamic fields if not specified [RemoveFields(field_names=remove_field_names)] - + ([SetField(output_field=FieldName.FEAT_STATIC_CAT, value=[0])] if not config.num_feat_static_cat > 0 else []) + # step 2: use static features if available, if not add dummy values + + ( + [SetField(output_field=FieldName.FEAT_STATIC_CAT, value=[0])] + if not config.num_static_categorical_features > 0 + else [] + ) + ( [SetField(output_field=FieldName.FEAT_STATIC_REAL, value=[0.0])] - if not config.num_feat_static_real > 0 + if not config.num_static_real_features > 0 else [] ) + # step 3: convert the data to NumPy (potentially not needed) + [ - AsNumpyArray( - field=FieldName.FEAT_STATIC_CAT, - expected_ndim=1, - dtype=int, - ), - AsNumpyArray( - field=FieldName.FEAT_STATIC_REAL, - expected_ndim=1, - ), + AsNumpyArray(field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=int), + AsNumpyArray(field=FieldName.FEAT_STATIC_REAL, expected_ndim=1), AsNumpyArray( field=FieldName.TARGET, + # in the following line, we add 1 for the time dimension expected_ndim=config.input_size, ), - 
AddObservedValuesIndicator( - target_field=FieldName.TARGET, - output_field=FieldName.OBSERVED_VALUES, - ), + # step 4: handle the NaN's by filling in the target with zero + # and return the mask (which is in the observed values) + # true for observed values, false for nan's + # the decoder uses this mask (no loss is incurred for unobserved values) + # see loss_weights inside the xxxForPrediction model + AddObservedValuesIndicator(target_field=FieldName.TARGET, output_field=FieldName.OBSERVED_VALUES), + # step 5: add temporal features based on freq of the dataset + # month of year in this case + # these serve as positional encodings AddTimeFeatures( start_field=FieldName.START, target_field=FieldName.TARGET, @@ -93,23 +104,37 @@ def create_transformation(freq, config) -> Transformation: time_features=time_features_from_frequency_str(freq), pred_length=config.prediction_length, ), + # step 6: add another temporal feature (just a single number) + # tells the model where in the life the value of the time series is + # sort of running counter AddAgeFeature( target_field=FieldName.TARGET, output_field=FieldName.FEAT_AGE, pred_length=config.prediction_length, log_scale=True, ), + # step 7: vertically stack all the temporal features VstackFeatures( output_field=FieldName.FEAT_TIME, input_fields=[FieldName.FEAT_TIME, FieldName.FEAT_AGE] - + ([FieldName.FEAT_DYNAMIC_REAL] if config.num_feat_dynamic_real > 0 else []), + + ([FieldName.FEAT_DYNAMIC_REAL] if config.num_dynamic_real_features > 0 else []), + ), + # step 8: rename to match HuggingFace names + RenameFields( + mapping={ + FieldName.FEAT_STATIC_CAT: "static_categorical_features", + FieldName.FEAT_STATIC_REAL: "static_real_features", + FieldName.FEAT_TIME: "time_features", + FieldName.TARGET: "values", + FieldName.OBSERVED_VALUES: "observed_mask", + } ), ] ) def create_instance_splitter( - config, + config: PretrainedConfig, mode: str, train_sampler: Optional[InstanceSampler] = None, validation_sampler: Optional[InstanceSampler] = None, @@ -123,23 +148,20 @@ def create_instance_splitter( }[mode] return InstanceSplitter( - target_field=FieldName.TARGET, + target_field="values", is_pad_field=FieldName.IS_PAD, start_field=FieldName.START, forecast_start_field=FieldName.FORECAST_START, instance_sampler=instance_sampler, - past_length=config.context_length + max(config.lags_seq), + past_length=config.context_length + max(config.lags_sequence), future_length=config.prediction_length, - time_series_fields=[ - FieldName.FEAT_TIME, - FieldName.OBSERVED_VALUES, - ], + time_series_fields=["time_features", "observed_mask"], ) -def create_training_data_loader( +def create_train_dataloader( + config: PretrainedConfig, freq, - config, data, batch_size: int, num_batches_per_epoch: int, @@ -147,69 +169,58 @@ def create_training_data_loader( **kwargs, ) -> Iterable: PREDICTION_INPUT_NAMES = [ - FieldName.FEAT_STATIC_CAT, - FieldName.FEAT_STATIC_REAL, - "past_" + FieldName.FEAT_TIME, - "past_" + FieldName.TARGET, - "past_" + FieldName.OBSERVED_VALUES, - "future_" + FieldName.FEAT_TIME, + "static_categorical_features", + "static_real_features", + "past_time_features", + "past_values", + "past_observed_mask", + "future_time_features", ] - TRAINING_INPUT_NAMES = PREDICTION_INPUT_NAMES + [ - "future_" + FieldName.TARGET, - "future_" + FieldName.OBSERVED_VALUES, - ] + TRAINING_INPUT_NAMES = PREDICTION_INPUT_NAMES + ["future_values", "future_observed_mask"] transformation = create_transformation(freq, config) transformed_data = transformation.apply(data, 
is_train=True) + # we initialize a Training instance splitter instance_splitter = create_instance_splitter(config, "train") + SelectFields(TRAINING_INPUT_NAMES) + # the instance splitter will sample a window of + # context length + lags + prediction length (from the transformed time series of a dataset) + # randomly from within the values of the time series and return another iterator. training_instances = instance_splitter.apply( Cyclic(transformed_data) if shuffle_buffer_length is None - else PseudoShuffled( - Cyclic(transformed_data), - shuffle_buffer_length=shuffle_buffer_length, - ), - is_train=True, + else PseudoShuffled(Cyclic(transformed_data), shuffle_buffer_length=shuffle_buffer_length) ) + # from the training instances iterator we now return a Dataloader which will + # continue to sample random windows for as long as it is called + # to return batch_size of the appropriate tensors ready for training! return IterableSlice( - iter( - DataLoader( - IterableDataset(training_instances), - batch_size=batch_size, - **kwargs, - ) - ), + iter(DataLoader(IterableDataset(training_instances), batch_size=batch_size, **kwargs)), num_batches_per_epoch, ) -def create_validation_data_loader( - freq, - config, - data, - batch_size, - **kwargs, -): +def create_validation_dataloader(freq, config, data, batch_size, **kwargs): PREDICTION_INPUT_NAMES = [ - FieldName.FEAT_STATIC_CAT, - FieldName.FEAT_STATIC_REAL, - "past_" + FieldName.FEAT_TIME, - "past_" + FieldName.TARGET, - "past_" + FieldName.OBSERVED_VALUES, - "future_" + FieldName.FEAT_TIME, + "static_categorical_features", + "static_real_features", + "past_time_features", + "past_values", + "past_observed_mask", + "future_time_features", ] - TRAINING_INPUT_NAMES = PREDICTION_INPUT_NAMES + [ - "future_" + FieldName.TARGET, - "future_" + FieldName.OBSERVED_VALUES, + "future_values", + "future_observed_mask", ] + transformation = create_transformation(freq, config) transformed_data = transformation.apply(data, is_train=True) + # we initialize a Validation instance splitter instance_splitter = create_instance_splitter(config, "validation") + SelectFields(TRAINING_INPUT_NAMES) validation_instances = instance_splitter.apply(transformed_data, is_train=True) @@ -220,28 +231,25 @@ def create_validation_data_loader( ) -def create_test_data_loader( - freq, - config, - data, - batch_size, - **kwargs, -): +def create_test_dataloader(config: PretrainedConfig, freq, data, batch_size: int, **kwargs): PREDICTION_INPUT_NAMES = [ - FieldName.FEAT_STATIC_CAT, - FieldName.FEAT_STATIC_REAL, - "past_" + FieldName.FEAT_TIME, - "past_" + FieldName.TARGET, - "past_" + FieldName.OBSERVED_VALUES, - "future_" + FieldName.FEAT_TIME, + "static_categorical_features", + "static_real_features", + "past_time_features", + "past_values", + "past_observed_mask", + "future_time_features", ] + transformation = create_transformation(freq, config) transformed_data = transformation.apply(data, is_train=False) + + # we create a Test Instance splitter which will sample the very last + # context window seen during training only for the encoder. 
instance_splitter = create_instance_splitter(config, "test") + SelectFields(PREDICTION_INPUT_NAMES) - test_instances = instance_splitter.apply(transformed_data, is_tran=False) - return DataLoader( - IterableDataset(test_instances), - batch_size=batch_size, - **kwargs, - ) + # we apply the transformations in test mode + testing_instances = instance_splitter.apply(transformed_data, is_train=False) + + # This returns a Dataloader which will go over the dataset once. + return DataLoader(IterableDataset(testing_instances), batch_size=batch_size, **kwargs) From ffb319f20293c0f95cac5496746cec46e038b488 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 12:27:20 +0200 Subject: [PATCH 149/164] make style --- examples/pytorch/time-series-prediction/utils_ts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/time-series-prediction/utils_ts.py b/examples/pytorch/time-series-prediction/utils_ts.py index 0636e170346eb..747a0e2acd893 100644 --- a/examples/pytorch/time-series-prediction/utils_ts.py +++ b/examples/pytorch/time-series-prediction/utils_ts.py @@ -34,13 +34,13 @@ ExpectedNumInstanceSampler, InstanceSplitter, RemoveFields, + RenameFields, SelectFields, SetField, TestSplitSampler, Transformation, ValidationSplitSampler, VstackFeatures, - RenameFields, ) from gluonts.transform.sampler import InstanceSampler From 6649b29996ab8a5b2696de05396849ff34beedb2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 12:36:46 +0200 Subject: [PATCH 150/164] cleanup --- .../time-series-prediction/utils_ts.py | 62 ++++++------------- 1 file changed, 19 insertions(+), 43 deletions(-) diff --git a/examples/pytorch/time-series-prediction/utils_ts.py b/examples/pytorch/time-series-prediction/utils_ts.py index 747a0e2acd893..03ac346dd7c31 100644 --- a/examples/pytorch/time-series-prediction/utils_ts.py +++ b/examples/pytorch/time-series-prediction/utils_ts.py @@ -1,5 +1,6 @@ # coding=utf-8 # Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -45,6 +46,18 @@ from gluonts.transform.sampler import InstanceSampler +PREDICTION_INPUT_NAMES = [ + "static_categorical_features", + "static_real_features", + "past_time_features", + "past_values", + "past_observed_mask", + "future_time_features", +] + +TRAIN_VAL_INPUT_NAMES = PREDICTION_INPUT_NAMES + ["future_values", "future_observed_mask"] + + @lru_cache(10_000) def _as_period(val, freq): return pd.Period(val, freq) @@ -104,9 +117,9 @@ def create_transformation(freq: str, config: PretrainedConfig) -> Transformation time_features=time_features_from_frequency_str(freq), pred_length=config.prediction_length, ), - # step 6: add another temporal feature (just a single number) + # step 6: add another temporal feature (just a single number per time step) # tells the model where in the life the value of the time series is - # sort of running counter + # kind of running counter which is log transformed AddAgeFeature( target_field=FieldName.TARGET, output_field=FieldName.FEAT_AGE, @@ -168,22 +181,11 @@ def create_train_dataloader( shuffle_buffer_length: Optional[int] = None, **kwargs, ) -> Iterable: - PREDICTION_INPUT_NAMES = [ - "static_categorical_features", - "static_real_features", - "past_time_features", - "past_values", - "past_observed_mask", - "future_time_features", - ] - - TRAINING_INPUT_NAMES = PREDICTION_INPUT_NAMES + ["future_values", "future_observed_mask"] - transformation = create_transformation(freq, config) transformed_data = transformation.apply(data, is_train=True) # we initialize a Training instance splitter - instance_splitter = create_instance_splitter(config, "train") + SelectFields(TRAINING_INPUT_NAMES) + instance_splitter = create_instance_splitter(config, "train") + SelectFields(TRAIN_VAL_INPUT_NAMES) # the instance splitter will sample a window of # context length + lags + prediction length (from the transformed time series of a dataset) @@ -199,48 +201,22 @@ def create_train_dataloader( # to return batch_size of the appropriate tensors ready for training! 
return IterableSlice( iter(DataLoader(IterableDataset(training_instances), batch_size=batch_size, **kwargs)), - num_batches_per_epoch, + length=num_batches_per_epoch, ) def create_validation_dataloader(freq, config, data, batch_size, **kwargs): - PREDICTION_INPUT_NAMES = [ - "static_categorical_features", - "static_real_features", - "past_time_features", - "past_values", - "past_observed_mask", - "future_time_features", - ] - TRAINING_INPUT_NAMES = PREDICTION_INPUT_NAMES + [ - "future_values", - "future_observed_mask", - ] - transformation = create_transformation(freq, config) transformed_data = transformation.apply(data, is_train=True) # we initialize a Validation instance splitter - instance_splitter = create_instance_splitter(config, "validation") + SelectFields(TRAINING_INPUT_NAMES) + instance_splitter = create_instance_splitter(config, "validation") + SelectFields(TRAIN_VAL_INPUT_NAMES) validation_instances = instance_splitter.apply(transformed_data, is_train=True) - return DataLoader( - IterableDataset(validation_instances), - batch_size=batch_size, - **kwargs, - ) + return DataLoader(IterableDataset(validation_instances), batch_size=batch_size, **kwargs) def create_test_dataloader(config: PretrainedConfig, freq, data, batch_size: int, **kwargs): - PREDICTION_INPUT_NAMES = [ - "static_categorical_features", - "static_real_features", - "past_time_features", - "past_values", - "past_observed_mask", - "future_time_features", - ] - transformation = create_transformation(freq, config) transformed_data = transformation.apply(data, is_train=False) From 006a80281d5e906cb5de4bc53939851ddaf09a65 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 13:42:43 +0200 Subject: [PATCH 151/164] undo position of past_observed_mask --- .../modeling_time_series_transformer.py | 6 +++--- .../test_modeling_time_series_transformer.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 928e3fb304a3b..7254415ecac7e 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1592,11 +1592,11 @@ def forward( self, past_values: torch.Tensor, past_time_features: torch.Tensor, + past_observed_mask: torch.Tensor, static_categorical_features: torch.Tensor, static_real_features: torch.Tensor, future_values: Optional[torch.Tensor] = None, future_time_features: Optional[torch.Tensor] = None, - past_observed_mask: Optional[torch.Tensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, decoder_head_mask: Optional[torch.Tensor] = None, @@ -1757,11 +1757,11 @@ def forward( self, past_values: torch.Tensor, past_time_features: torch.Tensor, + past_observed_mask: torch.Tensor, static_categorical_features: torch.Tensor, static_real_features: torch.Tensor, future_values: Optional[torch.Tensor] = None, future_time_features: Optional[torch.Tensor] = None, - past_observed_mask: Optional[torch.Tensor] = None, future_observed_mask: Optional[torch.Tensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, @@ -1885,8 +1885,8 @@ def generate( static_real_features: torch.Tensor, past_time_features: torch.Tensor, past_values: torch.Tensor, + past_observed_mask: torch.Tensor, 
future_time_features: Optional[torch.Tensor], - past_observed_mask: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> torch.Tensor: diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 6c6d15daa0ce1..d513f1fe21252 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -227,11 +227,11 @@ def test_forward_signature(self): expected_arg_names = [ "past_values", "past_time_features", + "past_observed_mask", "static_categorical_features", "static_real_features", "future_values", "future_time_features", - "past_observed_mask", ] expected_arg_names.extend( From fcd48cd758599d560badc06aea693b1a45364778 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 13:53:38 +0200 Subject: [PATCH 152/164] fix import --- examples/pytorch/time-series-prediction/utils_ts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/pytorch/time-series-prediction/utils_ts.py b/examples/pytorch/time-series-prediction/utils_ts.py index 03ac346dd7c31..5861bb48a8e15 100644 --- a/examples/pytorch/time-series-prediction/utils_ts.py +++ b/examples/pytorch/time-series-prediction/utils_ts.py @@ -45,6 +45,8 @@ ) from gluonts.transform.sampler import InstanceSampler +from .configuration_utils import PretrainedConfig + PREDICTION_INPUT_NAMES = [ "static_categorical_features", From 4ebc3706020c2620b3537575f03e143f0a97f2bd Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 13:55:46 +0200 Subject: [PATCH 153/164] typo --- examples/pytorch/time-series-prediction/utils_ts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/pytorch/time-series-prediction/utils_ts.py b/examples/pytorch/time-series-prediction/utils_ts.py index 5861bb48a8e15..c55a0bef5d19f 100644 --- a/examples/pytorch/time-series-prediction/utils_ts.py +++ b/examples/pytorch/time-series-prediction/utils_ts.py @@ -44,8 +44,7 @@ VstackFeatures, ) from gluonts.transform.sampler import InstanceSampler - -from .configuration_utils import PretrainedConfig +from transformer import PretrainedConfig PREDICTION_INPUT_NAMES = [ From 2ab53bcff4f3587e9155480ab7afa779189d76d5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 13:57:22 +0200 Subject: [PATCH 154/164] more typo --- examples/pytorch/time-series-prediction/utils_ts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/time-series-prediction/utils_ts.py b/examples/pytorch/time-series-prediction/utils_ts.py index c55a0bef5d19f..acb6485f63aa3 100644 --- a/examples/pytorch/time-series-prediction/utils_ts.py +++ b/examples/pytorch/time-series-prediction/utils_ts.py @@ -44,7 +44,7 @@ VstackFeatures, ) from gluonts.transform.sampler import InstanceSampler -from transformer import PretrainedConfig +from transformers import PretrainedConfig PREDICTION_INPUT_NAMES = [ From 78cd493eed4cccbd395ac22c86a3588c156095b3 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 14:06:06 +0200 Subject: [PATCH 155/164] rename example files --- ...ainer.py => run_time_series_no_trainer.py} | 31 ++++++++++++++++--- .../{utils_ts.py => utils_time_series.py} | 0 2 files changed, 27 insertions(+), 4 deletions(-) rename examples/pytorch/time-series-prediction/{run_ts_no_trainer.py => run_time_series_no_trainer.py} (92%) 
rename examples/pytorch/time-series-prediction/{utils_ts.py => utils_time_series.py} (100%) diff --git a/examples/pytorch/time-series-prediction/run_ts_no_trainer.py b/examples/pytorch/time-series-prediction/run_time_series_no_trainer.py similarity index 92% rename from examples/pytorch/time-series-prediction/run_ts_no_trainer.py rename to examples/pytorch/time-series-prediction/run_time_series_no_trainer.py index e27f3cf985491..a5d4ee66c279f 100644 --- a/examples/pytorch/time-series-prediction/run_ts_no_trainer.py +++ b/examples/pytorch/time-series-prediction/run_time_series_no_trainer.py @@ -55,6 +55,25 @@ def parse_args(): default="tourism_monthly", help="The configuration name of the dataset to use (via the datasets library).", ) + parser.add_argument( + "--freq", + type=str, + default="1M", + help="The freq of the dataset.", + ) + parser.add_argument( + "--prediction_length", + type=int, + default=24, + help="The prediction length of the dataset.", + ) + parser.add_argument( + "--context_length", + type=int, + default=24 * 3, + help="The configuration context_length of the dataset.", + ) + parser.add_argument( "--model_name_or_path", type=str, @@ -82,19 +101,19 @@ def parse_args(): parser.add_argument( "--per_device_train_batch_size", type=int, - default=8, + default=64, help="Batch size (per device) for the training dataloader.", ) parser.add_argument( "--per_device_eval_batch_size", type=int, - default=8, + default=64, help="Batch size (per device) for the evaluation dataloader.", ) parser.add_argument( "--learning_rate", type=float, - default=5e-5, + default=5e-4, help="Initial learning rate (after the potential warmup period) to use.", ) parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") @@ -202,7 +221,7 @@ def main(): raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) # config - config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir) + config = AutoConfig.from_pretrained(args.model_name_or_path) # model model = TimeSeriesTransformerForPrediction(config) @@ -212,3 +231,7 @@ def main(): print(raw_datasets) print(model) print(repo) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/time-series-prediction/utils_ts.py b/examples/pytorch/time-series-prediction/utils_time_series.py similarity index 100% rename from examples/pytorch/time-series-prediction/utils_ts.py rename to examples/pytorch/time-series-prediction/utils_time_series.py From 4a7f8c9a1b54884ac16831a716cc64841fe8e98d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 19:14:31 +0200 Subject: [PATCH 156/164] remove example for now --- .../time-series-prediction/requirements.txt | 3 - .../run_time_series_no_trainer.py | 237 ------------------ .../utils_time_series.py | 232 ----------------- 3 files changed, 472 deletions(-) delete mode 100644 examples/pytorch/time-series-prediction/requirements.txt delete mode 100644 examples/pytorch/time-series-prediction/run_time_series_no_trainer.py delete mode 100644 examples/pytorch/time-series-prediction/utils_time_series.py diff --git a/examples/pytorch/time-series-prediction/requirements.txt b/examples/pytorch/time-series-prediction/requirements.txt deleted file mode 100644 index eeac4319dcc22..0000000000000 --- a/examples/pytorch/time-series-prediction/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -accelerate -datasets -gluonts[torch] diff --git a/examples/pytorch/time-series-prediction/run_time_series_no_trainer.py 
b/examples/pytorch/time-series-prediction/run_time_series_no_trainer.py deleted file mode 100644 index a5d4ee66c279f..0000000000000 --- a/examples/pytorch/time-series-prediction/run_time_series_no_trainer.py +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -import argparse -import logging -import os -from pathlib import Path - -import datasets -from datasets import load_dataset - -import transformers -from accelerate import Accelerator -from accelerate.logging import get_logger -from accelerate.utils import set_seed -from huggingface_hub import Repository -from transformers import AutoConfig, SchedulerType, TimeSeriesTransformerForPrediction -from transformers.utils import get_full_repo_name, send_example_telemetry -from transformers.utils.versions import require_version - - -""" Training a 🤗 Transformers model for time series prediction""" - - -logger = get_logger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/time-series-prediction/requirements.txt") - - -# Parsing input arguments -def parse_args(): - parser = argparse.ArgumentParser( - description="Finetune a transformers model on a probabilistic time series forecasting task" - ) - parser.add_argument( - "--dataset_name", - type=str, - default="monash_tsf", - help="The name of the dataset to use (via the datasets library).", - ) - parser.add_argument( - "--dataset_config_name", - type=str, - default="tourism_monthly", - help="The configuration name of the dataset to use (via the datasets library).", - ) - parser.add_argument( - "--freq", - type=str, - default="1M", - help="The freq of the dataset.", - ) - parser.add_argument( - "--prediction_length", - type=int, - default=24, - help="The prediction length of the dataset.", - ) - parser.add_argument( - "--context_length", - type=int, - default=24 * 3, - help="The configuration context_length of the dataset.", - ) - - parser.add_argument( - "--model_name_or_path", - type=str, - help="Path to pretrained model or model identifier from huggingface.co/models.", - required=False, - ) - parser.add_argument( - "--config_name", - type=str, - default=None, - help="Pretrained config name or path if not the same as model_name", - ) - parser.add_argument( - "--prediction_length", - type=int, - default=None, - help="The prediction horizon of the time series forecasting task.", - ) - parser.add_argument( - "--freq", - type=str, - default=None, - help="The frequency of the time series.", - ) - parser.add_argument( - "--per_device_train_batch_size", - type=int, - default=64, - help="Batch size (per device) for the training dataloader.", - ) - parser.add_argument( - "--per_device_eval_batch_size", - type=int, - default=64, - help="Batch size (per device) for the evaluation dataloader.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-4, - help="Initial learning rate (after the potential warmup period) to use.", - ) - 
parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") - parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--lr_scheduler_type", - type=SchedulerType, - default="linear", - help="The scheduler type to use.", - choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], - ) - parser.add_argument( - "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." - ) - parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument( - "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." - ) - parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--with_tracking", - action="store_true", - help="Whether to enable experiment trackers for logging.", - ) - parser.add_argument( - "--report_to", - type=str, - default="all", - help=( - 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' - "Only applicable when `--with_tracking` is passed." - ), - ) - args = parser.parse_args() - - return args - - -def main(): - args = parse_args() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_ts_no_trainer", args) - - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers - # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) - - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state, main_process_only=False) - if accelerator.is_local_main_process: - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_info() - else: - datasets.utils.logging.set_verbosity_error() - transformers.utils.logging.set_verbosity_error() - - # If passed along, set the training seed now. 
- if args.seed is not None: - set_seed(args.seed) - - # Handle the repository creation - if accelerator.is_main_process: - if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - repo = Repository(args.output_dir, clone_from=repo_name) - - with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: - if "step_*" not in gitignore: - gitignore.write("step_*\n") - if "epoch_*" not in gitignore: - gitignore.write("epoch_*\n") - elif args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - accelerator.wait_for_everyone() - - # Get the datasets - raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) - - # config - config = AutoConfig.from_pretrained(args.model_name_or_path) - - # model - model = TimeSeriesTransformerForPrediction(config) - - # just printing for now to make sure quality passes - print(args) - print(raw_datasets) - print(model) - print(repo) - - -if __name__ == "__main__": - main() diff --git a/examples/pytorch/time-series-prediction/utils_time_series.py b/examples/pytorch/time-series-prediction/utils_time_series.py deleted file mode 100644 index acb6485f63aa3..0000000000000 --- a/examples/pytorch/time-series-prediction/utils_time_series.py +++ /dev/null @@ -1,232 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Transformations Utilities for Time Series Transformers. 
""" - -from functools import lru_cache -from typing import Iterable, Optional - -import numpy as np -import pandas as pd -from torch.utils.data import DataLoader - -from gluonts.dataset.field_names import FieldName -from gluonts.itertools import Cyclic, IterableSlice, PseudoShuffled -from gluonts.time_feature import time_features_from_frequency_str -from gluonts.torch.util import IterableDataset -from gluonts.transform import ( - AddAgeFeature, - AddObservedValuesIndicator, - AddTimeFeatures, - AsNumpyArray, - Chain, - ExpectedNumInstanceSampler, - InstanceSplitter, - RemoveFields, - RenameFields, - SelectFields, - SetField, - TestSplitSampler, - Transformation, - ValidationSplitSampler, - VstackFeatures, -) -from gluonts.transform.sampler import InstanceSampler -from transformers import PretrainedConfig - - -PREDICTION_INPUT_NAMES = [ - "static_categorical_features", - "static_real_features", - "past_time_features", - "past_values", - "past_observed_mask", - "future_time_features", -] - -TRAIN_VAL_INPUT_NAMES = PREDICTION_INPUT_NAMES + ["future_values", "future_observed_mask"] - - -@lru_cache(10_000) -def _as_period(val, freq): - return pd.Period(val, freq) - - -def transform_data(batch, freq, log1p_transform=False): - batch[FieldName.START] = [_as_period(entry, freq) for entry in batch[FieldName.START]] - if log1p_transform: - batch[FieldName.TARGET] = np.log1p(batch[FieldName.TARGET]) - return batch - - -def create_transformation(freq: str, config: PretrainedConfig) -> Transformation: - remove_field_names = [] - if config.num_static_real_features == 0: - remove_field_names.append(FieldName.FEAT_STATIC_REAL) - if config.num_dynamic_real_features == 0: - remove_field_names.append(FieldName.FEAT_DYNAMIC_REAL) - - # a bit like torchvision.transforms.Compose - return Chain( - # step 1: remove static/dynamic fields if not specified - [RemoveFields(field_names=remove_field_names)] - # step 2: use static features if available, if not add dummy values - + ( - [SetField(output_field=FieldName.FEAT_STATIC_CAT, value=[0])] - if not config.num_static_categorical_features > 0 - else [] - ) - + ( - [SetField(output_field=FieldName.FEAT_STATIC_REAL, value=[0.0])] - if not config.num_static_real_features > 0 - else [] - ) - # step 3: convert the data to NumPy (potentially not needed) - + [ - AsNumpyArray(field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=int), - AsNumpyArray(field=FieldName.FEAT_STATIC_REAL, expected_ndim=1), - AsNumpyArray( - field=FieldName.TARGET, - # in the following line, we add 1 for the time dimension - expected_ndim=config.input_size, - ), - # step 4: handle the NaN's by filling in the target with zero - # and return the mask (which is in the observed values) - # true for observed values, false for nan's - # the decoder uses this mask (no loss is incurred for unobserved values) - # see loss_weights inside the xxxForPrediction model - AddObservedValuesIndicator(target_field=FieldName.TARGET, output_field=FieldName.OBSERVED_VALUES), - # step 5: add temporal features based on freq of the dataset - # month of year in this case - # these serve as positional encodings - AddTimeFeatures( - start_field=FieldName.START, - target_field=FieldName.TARGET, - output_field=FieldName.FEAT_TIME, - time_features=time_features_from_frequency_str(freq), - pred_length=config.prediction_length, - ), - # step 6: add another temporal feature (just a single number per time step) - # tells the model where in the life the value of the time series is - # kind of running counter which is log 
transformed - AddAgeFeature( - target_field=FieldName.TARGET, - output_field=FieldName.FEAT_AGE, - pred_length=config.prediction_length, - log_scale=True, - ), - # step 7: vertically stack all the temporal features - VstackFeatures( - output_field=FieldName.FEAT_TIME, - input_fields=[FieldName.FEAT_TIME, FieldName.FEAT_AGE] - + ([FieldName.FEAT_DYNAMIC_REAL] if config.num_dynamic_real_features > 0 else []), - ), - # step 8: rename to match HuggingFace names - RenameFields( - mapping={ - FieldName.FEAT_STATIC_CAT: "static_categorical_features", - FieldName.FEAT_STATIC_REAL: "static_real_features", - FieldName.FEAT_TIME: "time_features", - FieldName.TARGET: "values", - FieldName.OBSERVED_VALUES: "observed_mask", - } - ), - ] - ) - - -def create_instance_splitter( - config: PretrainedConfig, - mode: str, - train_sampler: Optional[InstanceSampler] = None, - validation_sampler: Optional[InstanceSampler] = None, -) -> Transformation: - assert mode in ["train", "validation", "test"] - - instance_sampler = { - "train": train_sampler or ExpectedNumInstanceSampler(num_instances=1.0, min_future=config.prediction_length), - "validation": validation_sampler or ValidationSplitSampler(min_future=config.prediction_length), - "test": TestSplitSampler(), - }[mode] - - return InstanceSplitter( - target_field="values", - is_pad_field=FieldName.IS_PAD, - start_field=FieldName.START, - forecast_start_field=FieldName.FORECAST_START, - instance_sampler=instance_sampler, - past_length=config.context_length + max(config.lags_sequence), - future_length=config.prediction_length, - time_series_fields=["time_features", "observed_mask"], - ) - - -def create_train_dataloader( - config: PretrainedConfig, - freq, - data, - batch_size: int, - num_batches_per_epoch: int, - shuffle_buffer_length: Optional[int] = None, - **kwargs, -) -> Iterable: - transformation = create_transformation(freq, config) - transformed_data = transformation.apply(data, is_train=True) - - # we initialize a Training instance splitter - instance_splitter = create_instance_splitter(config, "train") + SelectFields(TRAIN_VAL_INPUT_NAMES) - - # the instance splitter will sample a window of - # context length + lags + prediction length (from the transformed time series of a dataset) - # randomly from within the values of the time series and return another iterator. - training_instances = instance_splitter.apply( - Cyclic(transformed_data) - if shuffle_buffer_length is None - else PseudoShuffled(Cyclic(transformed_data), shuffle_buffer_length=shuffle_buffer_length) - ) - - # from the training instances iterator we now return a Dataloader which will - # continue to sample random windows for as long as it is called - # to return batch_size of the appropriate tensors ready for training! 
- return IterableSlice( - iter(DataLoader(IterableDataset(training_instances), batch_size=batch_size, **kwargs)), - length=num_batches_per_epoch, - ) - - -def create_validation_dataloader(freq, config, data, batch_size, **kwargs): - transformation = create_transformation(freq, config) - transformed_data = transformation.apply(data, is_train=True) - - # we initialize a Validation instance splitter - instance_splitter = create_instance_splitter(config, "validation") + SelectFields(TRAIN_VAL_INPUT_NAMES) - validation_instances = instance_splitter.apply(transformed_data, is_train=True) - - return DataLoader(IterableDataset(validation_instances), batch_size=batch_size, **kwargs) - - -def create_test_dataloader(config: PretrainedConfig, freq, data, batch_size: int, **kwargs): - transformation = create_transformation(freq, config) - transformed_data = transformation.apply(data, is_train=False) - - # we create a Test Instance splitter which will sample the very last - # context window seen during training only for the encoder. - instance_splitter = create_instance_splitter(config, "test") + SelectFields(PREDICTION_INPUT_NAMES) - - # we apply the transformations in test mode - testing_instances = instance_splitter.apply(transformed_data, is_train=False) - - # This returns a Dataloader which will go over the dataset once. - return DataLoader(IterableDataset(testing_instances), batch_size=batch_size, **kwargs) From cb6635782f0000bb63848edffbddb82ebd0f716e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 19:47:19 +0200 Subject: [PATCH 157/164] Update docs/source/en/_toctree.yml Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a6441f98ca18d..9fc53c64a0c0e 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -494,7 +494,7 @@ title: Reinforcement learning models - isExpanded: false sections: - - local: model_doc/time_series_trasnformer + - local: model_doc/time_series_transformer title: Time Series Transformer title: Time series models title: Models From 4fe63feb4bf94034f09c5734784be47f2f3e0e6b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 19:50:49 +0200 Subject: [PATCH 158/164] Update src/transformers/models/time_series_transformer/configuration_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../configuration_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 36b960439eff4..b83583e3b0f36 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -58,7 +58,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): multivarate targets. scaling (`bool`, *optional* defaults to `True`): Whether to scale the input targets. - lags_sequence (`list[int]`, *optional*, defaults to [1, 2, 3, 4, 5, 6, 7]): + lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`): The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4, 5, 6, 7]`. 
num_time_features (`int`, *optional*, defaults to 0): From b28a2f9b43dc172cab045995b57e904f1a7007b1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 20:13:14 +0200 Subject: [PATCH 159/164] Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../time_series_transformer/modeling_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 7254415ecac7e..faec5812d1757 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1029,7 +1029,7 @@ def _set_gradient_checkpointing(self, module, value=False): - 0 indicates the head is **masked**. encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): From 8a3e4304c87037f12f0783712144e56ccffcb35f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 20:14:39 +0200 Subject: [PATCH 160/164] Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../modeling_time_series_transformer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index faec5812d1757..6d1979de01816 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1042,8 +1042,9 @@ def _set_gradient_checkpointing(self, module, value=False): If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - ``decoder_input_ids``` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of - shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. 
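The lag and context-length conventions documented in the hunks above decide how long the encoder window has to be. A minimal sketch of that relationship, assuming `TimeSeriesTransformerConfig` exposes the documented arguments as constructor keywords:

```python
from transformers import TimeSeriesTransformerConfig

# Documented defaults: context_length falls back to prediction_length,
# lags_sequence falls back to [1, 2, 3, 4, 5, 6, 7].
config = TimeSeriesTransformerConfig(
    prediction_length=24,
    context_length=48,
    lags_sequence=[1, 2, 3, 4, 5, 6, 7],
)

# The past window fed to the encoder is the context plus the largest lag,
# mirroring `past_length=config.context_length + max(config.lags_sequence)`
# in the instance splitter of the removed utilities above.
past_length = config.context_length + max(config.lags_sequence)
print(past_length)  # 48 + 7 = 55
```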
From f7b0ab5b140788fae87c7feed92b76b6a6e43c70 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 20:28:03 +0200 Subject: [PATCH 161/164] Update modeling_time_series_transformer.py fix style --- .../modeling_time_series_transformer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 6d1979de01816..7a7585ff658f1 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1030,7 +1030,7 @@ def _set_gradient_checkpointing(self, module, value=False): encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*) - `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape @@ -1044,10 +1044,9 @@ def _set_gradient_checkpointing(self, module, value=False): don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` - you can choose to directly pass an embedded representation. This is useful if you want more control over - how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup - matrix. + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see From 4fe8164fb926e20b4153c6a97f784a8fc33d387a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 20:32:54 +0200 Subject: [PATCH 162/164] fixed typo --- .../configuration_time_series_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index b83583e3b0f36..3e16ba8b10962 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -44,7 +44,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): Args: prediction_length (`int`): - The prediction length for the decoder. In other words, the prediction horizion of the model. + The prediction length for the decoder. In other words, the prediction horizon of the model. 
context_length (`int`, *optional*, defaults to `prediction_length`): The context length for the encoder. If `None`, the context length will be the same as the `prediction_length`. From 48a2be458aecfc5291bffe3e2d1b9b9895b7a03d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 20:45:17 +0200 Subject: [PATCH 163/164] fix typo and grammer --- .../configuration_time_series_transformer.py | 4 ++-- .../modeling_time_series_transformer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 3e16ba8b10962..23564f3e0efc4 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -178,7 +178,7 @@ def __init__( if cardinality and num_static_categorical_features > 0: if len(cardinality) != num_static_categorical_features: raise ValueError( - "The cardinality should be a list having the same length as `num_static_categorical_features`" + "The cardinality should be a list of the same length as `num_static_categorical_features`" ) self.cardinality = cardinality else: @@ -186,7 +186,7 @@ def __init__( if embedding_dimension and num_static_categorical_features > 0: if len(embedding_dimension) != num_static_categorical_features: raise ValueError( - "The embedding dimension should be a list having the same length as" + "The embedding dimension should be a list of the same length as" " `num_static_categorical_features`" ) self.embedding_dimension = embedding_dimension diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 7a7585ff658f1..bf39ae17564d4 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -936,7 +936,7 @@ def _set_gradient_checkpointing(self, module, value=False): Past values of the time series, that serve as context in order to predict the future. These values may contain lags, i.e. additional values from the past which are added in order to serve as "extra context". The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as - `static_categorical_features`, `static_real_features`, `past_time_featuresuresures`). + `static_categorical_features`, `static_real_features`, `past_time_features`). The sequence length here is equal to `context_length` + `max(config.lags_sequence)`. 
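The two `ValueError` messages touched in this commit guard the per-feature lists in the configuration. A short sketch of the contract they enforce (keyword names assumed from the documented attributes; the cardinalities below are placeholders):

```python
from transformers import TimeSeriesTransformerConfig

# One entry per static categorical feature, for both lists.
config = TimeSeriesTransformerConfig(
    prediction_length=24,
    num_static_categorical_features=2,
    cardinality=[320, 5],         # two features -> two cardinalities
    embedding_dimension=[16, 4],  # two features -> two embedding sizes
)

# A length mismatch hits the ValueError raised in `__init__`:
try:
    TimeSeriesTransformerConfig(
        prediction_length=24,
        num_static_categorical_features=2,
        cardinality=[320],  # only one entry for two features
    )
except ValueError as err:
    print(err)
```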
From 43dd269fb4810314e53f5d58de1d6f8509f38dd0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Sep 2022 20:45:59 +0200 Subject: [PATCH 164/164] fix style --- .../configuration_time_series_transformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 23564f3e0efc4..f42da86b55193 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -186,8 +186,7 @@ def __init__( if embedding_dimension and num_static_categorical_features > 0: if len(embedding_dimension) != num_static_categorical_features: raise ValueError( - "The embedding dimension should be a list of the same length as" - " `num_static_categorical_features`" + "The embedding dimension should be a list of the same length as `num_static_categorical_features`" ) self.embedding_dimension = embedding_dimension else:
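To round the series off, a hedged end-to-end sketch in the spirit of the removed `run_ts_no_trainer.py` example: build a configuration and a randomly initialised `TimeSeriesTransformerForPrediction` from it. Feature counts below are placeholders, and both classes are assumed to be exported at the top level as the earlier commits wire up.

```python
from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerForPrediction

config = TimeSeriesTransformerConfig(
    prediction_length=24,
    context_length=48,
    num_time_features=2,
    num_static_categorical_features=1,
    cardinality=[366],
)
model = TimeSeriesTransformerForPrediction(config)

# Training batches supply the TRAIN_VAL_INPUT_NAMES from the removed utilities
# (the past_* tensors plus future_values/future_observed_mask); at prediction
# time only the PREDICTION_INPUT_NAMES are required.
print(sum(p.numel() for p in model.parameters()))
```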